# This project is the crime project created by Yu Qiushi — il a tout fait!
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import altair as alt
import holoviews as hv
import seaborn as sns
import hvplot.pandas
import geopandas as gpd
from mpl_toolkits.axes_grid1 import make_axes_locatable
import geoviews as gv
import geoviews.tile_sources as gvts
import folium
import xyzservices
from folium.plugins import TimestampedGeoJson
import osmnx as ox
import pandana as pnda
from pandana.loaders import osm
import requests
from bs4 import BeautifulSoup
%matplotlib inline
from folium import GeoJson
import panel as pn
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", message="the convert_dtype parameter is deprecated")philadelphia_range_path = "final_data/city_limits_3857.geojson"
# Load the Philadelphia city-limits polygon and make sure it is in WGS84
# (lon/lat) so it lines up with the point data pulled later.
city_limits = gpd.read_file(philadelphia_range_path)
if city_limits.crs.to_string() != "EPSG:4326":
    city_limits = city_limits.to_crs(epsg=4326)
print(city_limits.crs)
city_limits.explore()
# Pull Part I/II crime incidents (2022-01-01 .. 2024-12-23) from the
# Philadelphia Carto SQL API and load the rows into a DataFrame.
base_url = "https://phl.carto.com/api/v2/sql"
sql_query = """
SELECT * FROM incidents_part1_part2
WHERE dispatch_date >= '2022-01-01' AND dispatch_date <= '2024-12-23'
"""
params = {
    'q': sql_query
}
response = requests.get(base_url, params=params)
# Fail fast with a clear HTTP error instead of a confusing KeyError on 'rows'.
response.raise_for_status()
data = response.json()
crime = pd.DataFrame(data['rows'])
print(crime.head())
print(len(crime)) cartodb_id the_geom \
0 2 0101000020E6100000A51C8299A5C752C006342AD3DCFF...
1 4 0101000020E6100000F9245E3B64CC52C0B7195D940FF6...
2 7 0101000020E6100000118A52E7F6C052C0CFF41263190C...
3 123 0101000020E6100000E1F9FB7B5FC552C0159C0B6D4A02...
4 126 0101000020E6100000D1CCD5875CCA52C014B723FFC005...
the_geom_webmercator objectid dc_dist psa \
0 0101000020110F0000F80DE2A145E65FC1E5EC7592BE8F... 114 25 3
1 0101000020110F00000426B7CE54EE5FC1C5E06D37E284... 116 01 1
2 0101000020110F00006728CED7EBDA5FC169DB64F8519D... 119 08 2
3 0101000020110F00009D28D4D968E25FC13CD5C3D06F92... 96 15 1
4 0101000020110F00002F28E30AE2EA5FC10090A3314796... 99 14 1
dispatch_date_time dispatch_date dispatch_time hour dc_key \
0 2023-03-11T17:12:00Z 2023-03-11 12:12:00 12.0 202325014482
1 2023-03-11T18:31:00Z 2023-03-11 13:31:00 13.0 202301004597
2 2023-03-11T22:13:00Z 2023-03-11 17:13:00 17.0 202308008412
3 2023-03-11T12:42:00Z 2023-03-11 07:42:00 7.0 202315017366
4 2023-03-12T00:54:00Z 2023-03-11 19:54:00 19.0 202314012625
location_block ucr_general text_general_code point_x \
0 3300 BLOCK HARTVILLE ST 300 Robbery No Firearm -75.119482
1 2400 BLOCK S 28TH ST 600 Theft from Vehicle -75.193618
2 9800 BLOCK Roosevelt Blvd 600 Thefts -75.015070
3 4700 BLOCK GRISCOM ST 600 Thefts -75.083953
4 5500 BLOCK BLOYD ST 300 Robbery No Firearm -75.161898
point_y
0 39.998927
1 39.922350
2 40.094525
3 40.017896
4 40.044952
477568
from shapely import wkb

# Decode Carto's hex-WKB geometry column into shapely points and build a
# GeoDataFrame, then align its CRS with the city-limits layer.
crime['geometry'] = crime['the_geom'].apply(wkb.loads)
crime_gdf = gpd.GeoDataFrame(crime, geometry='geometry', crs="EPSG:4326")
if crime_gdf.crs != city_limits.crs:
    crime_gdf = crime_gdf.to_crs(city_limits.crs)
print("Columns in the crime GeoDataFrame:", crime_gdf.columns.tolist())
print(crime_gdf.head())
cartodb_id the_geom \
0 2 0101000020E6100000A51C8299A5C752C006342AD3DCFF...
1 4 0101000020E6100000F9245E3B64CC52C0B7195D940FF6...
2 7 0101000020E6100000118A52E7F6C052C0CFF41263190C...
3 123 0101000020E6100000E1F9FB7B5FC552C0159C0B6D4A02...
4 126 0101000020E6100000D1CCD5875CCA52C014B723FFC005...
the_geom_webmercator objectid dc_dist psa \
0 0101000020110F0000F80DE2A145E65FC1E5EC7592BE8F... 114 25 3
1 0101000020110F00000426B7CE54EE5FC1C5E06D37E284... 116 01 1
2 0101000020110F00006728CED7EBDA5FC169DB64F8519D... 119 08 2
3 0101000020110F00009D28D4D968E25FC13CD5C3D06F92... 96 15 1
4 0101000020110F00002F28E30AE2EA5FC10090A3314796... 99 14 1
dispatch_date_time dispatch_date dispatch_time hour dc_key \
0 2023-03-11T17:12:00Z 2023-03-11 12:12:00 12.0 202325014482
1 2023-03-11T18:31:00Z 2023-03-11 13:31:00 13.0 202301004597
2 2023-03-11T22:13:00Z 2023-03-11 17:13:00 17.0 202308008412
3 2023-03-11T12:42:00Z 2023-03-11 07:42:00 7.0 202315017366
4 2023-03-12T00:54:00Z 2023-03-11 19:54:00 19.0 202314012625
location_block ucr_general text_general_code point_x \
0 3300 BLOCK HARTVILLE ST 300 Robbery No Firearm -75.119482
1 2400 BLOCK S 28TH ST 600 Theft from Vehicle -75.193618
2 9800 BLOCK Roosevelt Blvd 600 Thefts -75.015070
3 4700 BLOCK GRISCOM ST 600 Thefts -75.083953
4 5500 BLOCK BLOYD ST 300 Robbery No Firearm -75.161898
point_y geometry
0 39.998927 POINT (-75.11948 39.99893)
1 39.922350 POINT (-75.19362 39.92235)
2 40.094525 POINT (-75.01507 40.09452)
3 40.017896 POINT (-75.08395 40.01790)
4 40.044952 POINT (-75.16190 40.04495)
hv.extension('bokeh')
# Parse dispatch dates and derive a 'YYYY-MM' bucket for monthly aggregation.
crime_gdf['dispatch_date'] = pd.to_datetime(crime_gdf['dispatch_date'])
crime_gdf['year_month'] = crime_gdf['dispatch_date'].dt.to_period('M').astype(str)
crime_monthly = crime_gdf.groupby(['text_general_code', 'year_month']).size().reset_index(name='count')
unique_crimes = crime_gdf['text_general_code'].unique()
print(f"Unique crime types ({len(unique_crimes)}):", unique_crimes)
'Aggravated Assault Firearm' 'Aggravated Assault No Firearm' 'Rape'
'Burglary Residential' 'Robbery Firearm' 'Burglary Non-Residential'
'Other Assaults' 'Vandalism/Criminal Mischief'
'Narcotic / Drug Law Violations' 'Fraud'
'Offenses Against Family and Children' 'Weapon Violations'
'All Other Offenses' 'DRIVING UNDER THE INFLUENCE'
'Other Sex Offenses (Not Commercialized)' 'Receiving Stolen Property'
'Liquor Law Violations' 'Arson' 'Disorderly Conduct' 'Embezzlement'
'Prostitution and Commercialized Vice' 'Forgery and Counterfeiting'
'Public Drunkenness' 'Vagrancy/Loitering' 'Gambling Violations'
'Motor Vehicle Theft' 'Homicide - Criminal' 'Homicide - Justifiable'
'Homicide - Gross Negligence']
# Collapse the 32 raw offense labels into five broad categories so later
# filtering and plotting don't have to enumerate every type.
crime_categories = {
    'Violent Crime': [
        'Robbery No Firearm', 'Robbery Firearm', 'Aggravated Assault Firearm',
        'Aggravated Assault No Firearm', 'Homicide - Criminal', 'Homicide - Justifiable', 'Homicide - Gross Negligence'
    ],
    'Property Crime': [
        'Theft from Vehicle', 'Thefts', 'Burglary Residential', 'Burglary Non-Residential',
        'Motor Vehicle Theft', 'Arson', 'Receiving Stolen Property', 'Vandalism/Criminal Mischief'
    ],
    'Drug/Alcohol Crime': [
        'Narcotic / Drug Law Violations', 'DRIVING UNDER THE INFLUENCE', 'Liquor Law Violations',
        'Public Drunkenness'
    ],
    'Sexual Offenses': [
        'Rape', 'Other Sex Offenses (Not Commercialized)', 'Prostitution and Commercialized Vice'
    ],
    'Miscellaneous': [
        'Fraud', 'Offenses Against Family and Children', 'Weapon Violations', 'Disorderly Conduct',
        'Embezzlement', 'Forgery and Counterfeiting', 'Vagrancy/Loitering', 'Gambling Violations', 'All Other Offenses'
    ]
}

# Inverted lookup (offense label -> category): O(1) per row instead of
# scanning every category list for each of the ~478k crime records.
_crime_type_to_category = {
    crime_type: category
    for category, crimes in crime_categories.items()
    for crime_type in crimes
}

def map_crime_category(crime_type):
    """Return the broad category for a raw offense label, or 'Other' if unknown."""
    return _crime_type_to_category.get(crime_type, 'Other')
# Tag each record with its broad category and rebuild the monthly counts.
crime_gdf['crime_category'] = crime_gdf['text_general_code'].apply(map_crime_category)
crime_monthly = crime_gdf.groupby(['crime_category', 'year_month']).size().reset_index(name='count')
# Drop rows with missing or invalid geometries before mapping.
crime_gdf = crime_gdf[crime_gdf['geometry'].notnull()]
crime_gdf = crime_gdf[crime_gdf['geometry'].astype(object).apply(lambda geom: geom.is_valid if geom else False)]
# Interactive point map, one frame per year-month, colored by category.
crime_map = crime_gdf.hvplot.points(
    'geometry.x', 'geometry.y',
    geo=True,
    tiles='OSM',
    groupby='year_month',
    color='crime_category',
    title='Crime Events by Month and Category',
    size=5,
    alpha=0.6,
    height=600,
    width=800
)
Given a GeoSeries 's', you can use '~s.is_empty & s.notna()' to get back the old behaviour.
To further ignore this warning, you can do:
import warnings; warnings.filterwarnings('ignore', 'GeoSeries.notna', UserWarning)
return self.notna()
# Monthly counts per raw offense type, drawn as one line per type.
crime_monthly = crime_gdf.groupby(['text_general_code', 'year_month']).size().reset_index(name='count')
crime_plot = crime_monthly.hvplot.line(
    x='year_month',
    y='count',
    by='text_general_code',
    title='Monthly Crime Trends by Type',
    xlabel='Month',
    ylabel='Number of Crimes',
    legend='top_left',
    width=1000,
    height=500
)
crime_plot
# City boundary over a light basemap, for standalone display.
city_limits_layer = city_limits.hvplot.polygons(
    geo=True,
    tiles="CartoLight",
    line_width=2,
    line_color='black',
    fill_alpha=0.1,
    height=400,
    width=600
)
city_limits_layer
from bokeh.palettes import inferno
# Now let's visualize some data!
# Fixed palette: one inferno color per broad crime category.
crime_categories = ['Violent Crime', 'Property Crime', 'Drug/Alcohol Crime', 'Sexual Offenses', 'Miscellaneous']
_palette = inferno(len(crime_categories))
crime_colors = {category: _palette[i] for i, category in enumerate(crime_categories)}
# Outline-only boundary layer (no tiles) to overlay on the crime points.
city_limits_layer = city_limits.hvplot.polygons(
    geo=True,
    tiles=None,
    line_width=2,
    line_color='red',
    fill_alpha=0,
    height=700,
    width=900
)
# Create a Panel interactive dashboard.
# The slider indexes into the sorted list of 'YYYY-MM' labels.
month_slider = pn.widgets.IntSlider(name='Month', start=0, end=len(crime_gdf['year_month'].unique()) - 1, step=1, value=0)
month_list = sorted(crime_gdf['year_month'].unique())
@pn.depends(month_slider.param.value, watch=True)
def update_month(selected_index):
    # Translate the slider index into a 'YYYY-MM' label.
    # NOTE(review): the return value isn't consumed by any pane in this file —
    # confirm this watcher is still needed.
    return month_list[selected_index]
# Check-box group to filter by broad crime category.
type_selector = pn.widgets.CheckBoxGroup(
    name='Crime Types',
    value=list(crime_colors.keys()),
    options=list(crime_colors.keys())
)
@pn.depends(month_slider.param.value, type_selector.param.value)
def update_map(selected_index, selected_types):
    """Render crime points for the selected month/categories over the city outline."""
    selected_month = month_list[selected_index]
    # Keep only rows matching the chosen month and checked categories.
    filtered_data = crime_gdf[
        (crime_gdf['year_month'] == selected_month) &
        (crime_gdf['crime_category'].isin(selected_types))
    ]
    crime_layer = filtered_data.hvplot.points(
        'geometry.x', 'geometry.y',
        geo=True,
        tiles="CartoLight",
        color='crime_category',
        cmap=crime_colors,
        size=5,
        alpha=0.6,
        height=700,
        width=900
    )
    # Overlay the red city outline on the tiled point map.
    return city_limits_layer * crime_layer
@pn.depends(month_slider.param.value, type_selector.param.value)
def update_bar_chart(selected_index, selected_types):
    """Bar chart of crime counts per category for the selected month."""
    selected_month = month_list[selected_index]  # get the month label from the slider index
    filtered_data = crime_gdf[
        (crime_gdf['year_month'] == selected_month) &
        (crime_gdf['crime_category'].isin(selected_types))
    ]
    monthly_counts = filtered_data.groupby('crime_category').size().reset_index(name='count')
    bar_chart = alt.Chart(monthly_counts).mark_bar().encode(
        x=alt.X('crime_category', sort='-y', title='Crime Category'),
        y=alt.Y('count', title='Count'),
        # Fixed domain/range keeps category colors consistent with the map palette.
        color=alt.Color('crime_category', scale=alt.Scale(domain=list(crime_colors.keys()), range=list(crime_colors.values())))
    ).properties(
        title=f"Crime Categories for {selected_month}",
        width=400,
        height=250
    )
    return bar_chart
@pn.depends(type_selector.param.value)
def update_line_chart(selected_types):
    """Line chart of monthly counts for the checked categories across the full period."""
    filtered_data = crime_gdf[crime_gdf['crime_category'].isin(selected_types)]
    monthly_counts = filtered_data.groupby(['year_month', 'crime_category']).size().reset_index(name='count')
    line_chart = alt.Chart(monthly_counts).mark_line(point=True).encode(
        x=alt.X('year_month:T', title='Month'),  # 'YYYY-MM' strings parsed as temporal
        y=alt.Y('count:Q', title='Count'),
        # Same fixed color scale as the bar chart and map.
        color=alt.Color('crime_category:N', scale=alt.Scale(domain=list(crime_colors.keys()), range=list(crime_colors.values())))
    ).properties(
        title="Crime Trends by Month",
        width=400,
        height=250
    )
    return line_chart
# Combine bar chart and line chart into a single vertical panel.
@pn.depends(month_slider.param.value, type_selector.param.value)
def combined_chart(selected_index, selected_types):
    """Stack the month's bar chart above the overall trend line chart."""
    bar = update_bar_chart(selected_index, selected_types)
    line = update_line_chart(selected_types)
    return pn.Column(pn.pane.Vega(bar), pn.pane.Vega(line), sizing_mode='stretch_width')
# Dashboard layout: slider + checkboxes on top, map next to the charts.
crime_dashboard = pn.Column(
    "## Crime Dashboard",
    month_slider,
    type_selector,
    pn.Row(
        update_map,
        combined_chart,
        sizing_mode='stretch_width'
    )
)
crime_dashboard.servable()WARNING:param.main: VegaPlot was not imported on instantiation and may not render in a notebook. Restart the notebook kernel and ensure you load it as part of the extension using:
pn.extension('vega')
# #### Social Data Part ####
import cenpy as cny

# NOTE(review): a Census API key was previously hard-coded here in a comment;
# keep credentials out of source control (e.g. use an environment variable).
acs_dataset = "ACSDT5Y2023"  # ACS 5-Year dataset for 2023
conn = cny.products.APIConnection(acs_dataset)
variables_df = conn.variables
print(variables_df.head())
for Census API FIPS 'for' clause
in Census API FIPS 'in' clause
ucgid Uniform Census Geography Identifier clause
B24022_060E Estimate!!Total:!!Female:!!Service occupations...
B19001B_014E Estimate!!Total:!!$100,000 to $124,999
concept predicateType \
for Census API Geography Specification fips-for
in Census API Geography Specification fips-in
ucgid Census API Geography Specification ucgid
B24022_060E Sex by Occupation and Median Earnings in the P... int
B19001B_014E Household Income in the Past 12 Months (in 202... int
group limit predicateOnly hasGeoCollectionSupport \
for N/A 0 True NaN
in N/A 0 True NaN
ucgid N/A 0 True True
B24022_060E B24022 0 NaN NaN
B19001B_014E B19001B 0 NaN NaN
attributes required
for NaN NaN
in NaN NaN
ucgid NaN NaN
B24022_060E B24022_060EA,B24022_060M,B24022_060MA NaN
B19001B_014E B19001B_014EA,B19001B_014M,B19001B_014MA NaN
# Scrape some social data to visualize: selected ACS 2023 5-year variables
# for Philadelphia County at census-tract level, with geometry.
acs_variables = {
    "B01001_001E": "Total Population",
    "B03002_006E": "Asian Alone",
    "B03002_004E": "Black or African American Alone",
    "B03003_003E": "Hispanic or Latino",
    "B03002_003E": "White Alone",
    "B01002_001E": "Median Age",
    "B11001_001E": "Total Households",
    "B11001_002E": "Total Families",
    "B11001_003E": "Married-couple Family",
    "B25001_001E": "Total Housing Units",
    "B25002_002E": "Occupied Housing Units",
    "B25002_003E": "Vacant Housing Units",
    "B25003_002E": "Owner-occupied Housing Units",
    "B25003_003E": "Renter-occupied Housing Units",
    "B19013_001E": "Median Household Income",
    "B17010_001E": "Poverty Status Universe",
    "B17010_002E": "Below Poverty Level"
}
acs = cny.products.ACS()
data = acs.from_county(
    county="Philadelphia County, PA",
    variables=list(acs_variables.keys()),
    level="tract",
    return_geometry=True
)
# Swap the cryptic ACS codes for readable labels.
data = data.rename(columns=acs_variables)
# NOTE: shapefile truncates column names to 10 characters; the downstream
# cells rely on the truncated names (e.g. 'Total Popu').
data.to_file("Philadelphia_ACS_2023.shp")
data.to_file("Philadelphia_ACS_2023.shp")
# Read the saved ACS shapefile back and reproject to WGS84 for web mapping.
social_file_path = "Philadelphia_ACS_2023.shp"
acs_gdf = gpd.read_file(social_file_path)
if acs_gdf.crs.to_string() != "EPSG:4326":
    acs_gdf = acs_gdf.to_crs(epsg=4326)
print("CRS:", acs_gdf.crs)
print("Columns:", acs_gdf.columns)
print(acs_gdf.head())
acs_gdf.explore()
Columns: Index(['GEOID', 'Total Popu', 'Median Age', 'White Alon', 'Black or A',
'Asian Alon', 'Hispanic o', 'Total Hous', 'Total Fami', 'Married-co',
'Poverty St', 'Below Pove', 'Median Hou', 'Total Ho_1', 'Occupied H',
'Vacant Hou', 'Owner-occu', 'Renter-occ', 'NAME', 'state', 'county',
'tract', 'geometry'],
dtype='object')
GEOID Total Popu Median Age White Alon Black or A Asian Alon \
0 42101009802 6190.0 33.5 580.0 5341.0 35.0
1 42101037500 3736.0 31.3 1672.0 1467.0 165.0
2 42101021900 1434.0 44.7 1307.0 50.0 24.0
3 42101011300 3344.0 28.5 62.0 3229.0 0.0
4 42101021800 5141.0 30.9 2822.0 1497.0 247.0
Hispanic o Total Hous Total Fami Married-co ... Total Ho_1 \
0 149.0 2129.0 1467.0 576.0 ... 2371.0
1 298.0 1224.0 733.0 547.0 ... 1383.0
2 41.0 654.0 307.0 238.0 ... 751.0
3 53.0 1102.0 770.0 133.0 ... 1392.0
4 178.0 2242.0 1073.0 887.0 ... 2394.0
Occupied H Vacant Hou Owner-occu Renter-occ \
0 2129.0 242.0 1593.0 536.0
1 1224.0 159.0 722.0 502.0
2 654.0 97.0 551.0 103.0
3 1102.0 290.0 621.0 481.0
4 2242.0 152.0 597.0 1645.0
NAME state county tract \
0 Census Tract 98.02, Philadelphia County, Penns... 42 101 009802
1 Census Tract 375, Philadelphia County, Pennsyl... 42 101 037500
2 Census Tract 219, Philadelphia County, Pennsyl... 42 101 021900
3 Census Tract 113, Philadelphia County, Pennsyl... 42 101 011300
4 Census Tract 218, Philadelphia County, Pennsyl... 42 101 021800
geometry
0 POLYGON ((-75.27547 39.97743, -75.27528 39.977...
1 POLYGON ((-75.26551 39.98186, -75.26475 39.982...
2 POLYGON ((-75.25438 40.04657, -75.25384 40.046...
3 POLYGON ((-75.23947 39.98111, -75.23942 39.981...
4 POLYGON ((-75.23807 40.05988, -75.23622 40.061...
[5 rows x 23 columns]
Make this Notebook Trusted to load map: File -> Trust Notebook
# Numeric (plottable) columns only. `dtype in [float, int]` compares NumPy
# dtypes against Python types and silently misses some integer dtypes
# (e.g. int64 where the platform's C long is 32-bit), so use pandas'
# numeric-dtype predicate instead; non-numeric columns like GEOID/NAME and
# the geometry column are excluded either way.
visualizable_columns = [
    col for col in acs_gdf.columns if pd.api.types.is_numeric_dtype(acs_gdf[col])
]
print(visualizable_columns)
# Now let's see how the social data maps.
import folium
from folium import GeoJson, Tooltip
import panel as pn
import mapclassify as mc
import branca.colormap as cm
# Choropleth controls: classification scheme, number of classes, and variable.
classification_methods = ['Natural Breaks', 'Equal Interval']
num_classes_options = list(range(5, 11))  # NOTE(review): unused — the slider below caps at 8
data_selector = pn.widgets.Select(
    name="Select Data to Visualize",
    options=visualizable_columns,
    value=visualizable_columns[0]
)
classification_selector = pn.widgets.Select(
    name="Select Classification Method",
    options=classification_methods,
    value=classification_methods[0]
)
num_classes_selector = pn.widgets.IntSlider(
    name="Select Number of Classes",
    start=5,
    end=8,
    step=1,
    value=5
)
# Map update function: rebuilds the folium choropleth when any widget changes.
@pn.depends(
    data_selector.param.value,
    classification_selector.param.value,
    num_classes_selector.param.value
)
def update_map(selected_column, classification_method, num_classes):
    """Build a folium choropleth of the selected ACS column, classified per the widgets."""
    m = folium.Map(
        location=[39.9526, -75.1652],  # central Philadelphia
        zoom_start=12,
        tiles="CartoDB positron",
        control_scale=True,
        width=900,
        height=600
    )
    # Classify the column into `num_classes` bins with mapclassify.
    if classification_method == 'Natural Breaks':
        classifier = mc.NaturalBreaks(acs_gdf[selected_column], k=num_classes)
    elif classification_method == 'Equal Interval':
        classifier = mc.EqualInterval(acs_gdf[selected_column], k=num_classes)
    acs_gdf['class'] = classifier.yb  # per-tract class index (0..k-1)
    bins = classifier.bins
    # Continuous ramp used for both the legend and the per-class fills.
    # NOTE(review): the color list is fixed at 6 entries while k ranges 5-8;
    # fills are sampled from the ramp at each bin's upper bound — confirm intended.
    colormap = cm.LinearColormap(
        colors=['#f7fcf0', '#c7e9c0', '#7fcdbb', '#41b6c4', '#2c7fb8', '#253494'],
        vmin=acs_gdf[selected_column].min(),
        vmax=acs_gdf[selected_column].max(),
        caption=f"{selected_column} ({classification_method})"
    )
    # Map each class index to the ramp color at that bin's upper bound.
    fill_color_mapping = {i: colormap(v) for i, v in enumerate(bins)}
    # One GeoJson layer per tract so each polygon carries its own tooltip.
    for _, row in acs_gdf.iterrows():
        tooltip_text = (
            f"<b>{row['NAME']}</b><br>"
            f"{selected_column}: {row[selected_column]:,.2f}"
        )
        GeoJson(
            row['geometry'],
            # `row=row` binds the current row, avoiding the late-binding closure bug.
            style_function=lambda x, row=row: {
                "fillColor": fill_color_mapping[row['class']],
                "color": "transparent",
                "weight": 0,
                "fillOpacity": 0.7,
            },
            tooltip=Tooltip(tooltip_text)
        ).add_to(m)
    colormap.add_to(m)
    return m
# ACS dashboard layout: widgets on top, the reactive folium map below.
dashboard = pn.Column(
    pn.pane.Markdown("## Philadelphia ACS Data Visualization"),
    data_selector,
    classification_selector,
    num_classes_selector,
    pn.panel(update_map, sizing_mode="stretch_width", height=600)
)
dashboard.servable()C:\Users\44792\miniforge3\envs\musa-550-fall-2023\lib\site-packages\joblib\externals\loky\backend\context.py:136: UserWarning: Could not find the number of physical cores for the following reason:
found 0 physical cores < 1
Returning the number of logical cores instead. You can silence this warning by setting LOKY_MAX_CPU_COUNT to the number of cores you want to use.
warnings.warn(
File "C:\Users\44792\miniforge3\envs\musa-550-fall-2023\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
raise ValueError(f"found {cpu_count_physical} physical cores < 1")
C:\Users\44792\miniforge3\envs\musa-550-fall-2023\lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
warnings.warn(
# #### Housing data web scraping (OPA property assessments) ####
Housing_API_URL = "https://phl.carto.com/api/v2/sql"
start_date = "2022-01-01"
end_date = "2024-12-23"
query = f"""
SELECT *
FROM opa_properties_public
WHERE assessment_date >= '{start_date}' AND assessment_date <= '{end_date}'
"""
# Pass the SQL via `params` so requests reliably URL-encodes the spaces and
# newlines in the query (and matches the style of the crime query earlier),
# instead of splicing the raw string into the URL.
response = requests.get(Housing_API_URL, params={"q": query})
if response.status_code != 200:
    raise Exception(f"API request failed with status {response.status_code}: {response.text}")
# Got so many columns that some we won't need later.
Housing_data = response.json()
properties_df = pd.DataFrame(Housing_data['rows'])
print("Columns in the dataset:", properties_df.columns)
print("Columns in the dataset:", properties_df.columns)Columns in the dataset: Index(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'assessment_date',
'basements', 'beginning_point', 'book_and_page', 'building_code',
'building_code_description', 'category_code',
'category_code_description', 'census_tract', 'central_air',
'cross_reference', 'date_exterior_condition', 'depth',
'exempt_building', 'exempt_land', 'exterior_condition', 'fireplaces',
'frontage', 'fuel', 'garage_spaces', 'garage_type',
'general_construction', 'geographic_ward', 'homestead_exemption',
'house_extension', 'house_number', 'interior_condition', 'location',
'mailing_address_1', 'mailing_address_2', 'mailing_care_of',
'mailing_city_state', 'mailing_street', 'mailing_zip', 'market_value',
'market_value_date', 'number_of_bathrooms', 'number_of_bedrooms',
'number_of_rooms', 'number_stories', 'off_street_open',
'other_building', 'owner_1', 'owner_2', 'parcel_number', 'parcel_shape',
'quality_grade', 'recording_date', 'registry_number', 'sale_date',
'sale_price', 'separate_utilities', 'sewer', 'site_type', 'state_code',
'street_code', 'street_designation', 'street_direction', 'street_name',
'suffix', 'taxable_building', 'taxable_land', 'topography',
'total_area', 'total_livable_area', 'type_heater', 'unfinished', 'unit',
'utility', 'view_type', 'year_built', 'year_built_estimate', 'zip_code',
'zoning', 'pin', 'building_code_new', 'building_code_description_new',
'objectid'],
dtype='object')
properties_df.tail()| cartodb_id | the_geom | the_geom_webmercator | assessment_date | basements | beginning_point | book_and_page | building_code | building_code_description | category_code | ... | utility | view_type | year_built | year_built_estimate | zip_code | zoning | pin | building_code_new | building_code_description_new | objectid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 583657 | 584047 | 0101000020E6100000CA62F43784C452C0553088271300... | 0101000020110F0000DF8AED67F4E05FC1CB865FCEFA8F... | 2024-06-06T16:07:08Z | D | 417'8 3/4" NE LEFEVRE | 0000000 | H36 | SEMI/DET 2 STY FRAME | 1 | ... | None | I | 1920 | None | 19137 | RSA5 | 1001192329 | 32 | TWIN CONVENTIONAL | 563626477 |
| 583658 | 584048 | 0101000020E61000002876388229C352C0DAF8E3121703... | 0101000020110F00002E29FC7BA7DE5FC15D1760C65293... | 2024-06-06T16:05:14Z | None | 15' 1/2"SE DITMAN | 0000000 | O30 | ROW 2 STY MASONRY | 1 | ... | None | I | 1920 | Y | 19135 | RSA5 | 1001342218 | 24 | ROW PORCH FRONT | 563626478 |
| 583659 | 584049 | 0101000020E61000009638723466CA52C07637567AF002... | 0101000020110F0000C30DA979F2EA5FC11BC631F82793... | 2024-11-16T12:29:40Z | None | SE COR APSLEY ST | None | LB0 | IND. LGHT MFG MASONRY | 14 | ... | None | I | 2023 | None | 19144 | CMX3 | 1001681092 | 830 | APARTMENTS - MID RISE | 563626479 |
| 583660 | 584050 | 0101000020E61000009638723466CA52C07637567AF002... | 0101000020110F0000C30DA979F2EA5FC11BC631F82793... | 2024-12-14T16:53:16Z | None | SE COR APSLEY ST | None | LB0 | IND. LGHT MFG MASONRY | 10 | ... | None | I | 1920 | None | 19144 | CMX3 | 1001681093 | 241 | OFFICE BLDNG - Low Rise | 563626480 |
| 583661 | 584051 | 0101000020E6100000C75E7D144DC552C04F40CC0FA801... | 0101000020110F0000F891E19649E25FC1DADDADC3BB91... | 2024-06-06T16:05:55Z | None | 128'9" N GILLINGHAM | 0000000 | SR | VACANT LAND RESIDE < ACRE | 6 | ... | None | E | None | None | 19124 | ICMX | 1001579732 | None | None | 563626481 |
5 rows × 81 columns
from shapely import wkb

def safe_wkb_loads(wkb_str):
    """Decode a hex-encoded WKB string into a geometry; None if missing/unparseable."""
    try:
        raw = bytes.fromhex(wkb_str)
        return wkb.loads(raw)
    except Exception:
        # Covers None values, bad hex, and malformed WKB alike.
        return None

# Parse every property's geometry and keep only rows that decoded cleanly.
properties_df['parsed_geometry'] = properties_df['the_geom'].apply(safe_wkb_loads)
properties_with_geometry = properties_df[properties_df['parsed_geometry'].notnull()]
print(f"Number of properties after filtering: {len(properties_with_geometry)}")
print(properties_with_geometry.tail())
cartodb_id the_geom \
583657 584047 0101000020E6100000CA62F43784C452C0553088271300...
583658 584048 0101000020E61000002876388229C352C0DAF8E3121703...
583659 584049 0101000020E61000009638723466CA52C07637567AF002...
583660 584050 0101000020E61000009638723466CA52C07637567AF002...
583661 584051 0101000020E6100000C75E7D144DC552C04F40CC0FA801...
the_geom_webmercator \
583657 0101000020110F0000DF8AED67F4E05FC1CB865FCEFA8F...
583658 0101000020110F00002E29FC7BA7DE5FC15D1760C65293...
583659 0101000020110F0000C30DA979F2EA5FC11BC631F82793...
583660 0101000020110F0000C30DA979F2EA5FC11BC631F82793...
583661 0101000020110F0000F891E19649E25FC1DADDADC3BB91...
assessment_date basements beginning_point book_and_page \
583657 2024-06-06T16:07:08Z D 417'8 3/4" NE LEFEVRE 0000000
583658 2024-06-06T16:05:14Z None 15' 1/2"SE DITMAN 0000000
583659 2024-11-16T12:29:40Z None SE COR APSLEY ST None
583660 2024-12-14T16:53:16Z None SE COR APSLEY ST None
583661 2024-06-06T16:05:55Z None 128'9" N GILLINGHAM 0000000
building_code building_code_description category_code ... view_type \
583657 H36 SEMI/DET 2 STY FRAME 1 ... I
583658 O30 ROW 2 STY MASONRY 1 ... I
583659 LB0 IND. LGHT MFG MASONRY 14 ... I
583660 LB0 IND. LGHT MFG MASONRY 10 ... I
583661 SR VACANT LAND RESIDE < ACRE 6 ... E
year_built year_built_estimate zip_code zoning pin \
583657 1920 None 19137 RSA5 1001192329
583658 1920 Y 19135 RSA5 1001342218
583659 2023 None 19144 CMX3 1001681092
583660 1920 None 19144 CMX3 1001681093
583661 None None 19124 ICMX 1001579732
building_code_new building_code_description_new objectid \
583657 32 TWIN CONVENTIONAL 563626477
583658 24 ROW PORCH FRONT 563626478
583659 830 APARTMENTS - MID RISE 563626479
583660 241 OFFICE BLDNG - Low Rise 563626480
583661 None None 563626481
parsed_geometry
583657 POINT (-75.07056998124895 40.00058454656452)
583658 POINT (-75.04940848840545 40.02414165622186)
583659 POINT (-75.16248809008025 40.02296380243108)
583660 POINT (-75.16248809008025 40.02296380243108)
583661 POINT (-75.08282959216295 40.01294133637622)
[5 rows x 82 columns]
# Columns we don't need for the housing analysis.
columns_to_drop_housing = [
    'beginning_point', 'book_and_page', 'building_code',
    'building_code_description', 'category_code',
    'category_code_description', 'cross_reference', 'date_exterior_condition', 'depth',
    'exempt_building', 'exempt_land', 'exterior_condition',
    'general_construction', 'geographic_ward', 'homestead_exemption',
    'house_extension', 'location',
    'mailing_address_1', 'mailing_address_2', 'mailing_care_of',
    'mailing_city_state', 'mailing_street', 'mailing_zip',
    'other_building', 'owner_1', 'owner_2', 'parcel_number', 'parcel_shape',
    'registry_number', 'separate_utilities', 'sewer', 'site_type', 'state_code',
    'street_code', 'street_designation', 'street_direction', 'street_name',
    'suffix', 'taxable_building', 'taxable_land', 'topography', 'unfinished', 'unit',
    'utility', 'view_type', 'zip_code',
    'zoning', 'pin', 'building_code_new', 'building_code_description_new'
]
# Only drop the columns actually present, so a schema change upstream
# doesn't raise a KeyError.
columns_present_housing = [col for col in columns_to_drop_housing if col in properties_df.columns]
# `columns=` already implies the column axis; the redundant `axis=1` is removed.
cleaned_properties_df = properties_df.drop(columns=columns_present_housing)
print("Remaining columns in the cleaned dataset:")
print(cleaned_properties_df.columns)
Index(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'assessment_date',
'basements', 'census_tract', 'central_air', 'fireplaces', 'frontage',
'fuel', 'garage_spaces', 'garage_type', 'house_number',
'interior_condition', 'market_value', 'market_value_date',
'number_of_bathrooms', 'number_of_bedrooms', 'number_of_rooms',
'number_stories', 'off_street_open', 'quality_grade', 'recording_date',
'sale_date', 'sale_price', 'total_area', 'total_livable_area',
'type_heater', 'year_built', 'year_built_estimate', 'objectid',
'parsed_geometry'],
dtype='object')
cleaned_properties_gdf = gpd.GeoDataFrame(
cleaned_properties_df,
geometry='parsed_geometry',
crs="EPSG:4326"
)
print(f"Number of properties after filtering and column dropping: {len(cleaned_properties_gdf)}")
cleaned_properties_gdf.head()Number of properties after filtering and column dropping: 583662
| cartodb_id | the_geom | the_geom_webmercator | assessment_date | basements | census_tract | central_air | fireplaces | frontage | fuel | ... | recording_date | sale_date | sale_price | total_area | total_livable_area | type_heater | year_built | year_built_estimate | objectid | parsed_geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0101000020E6100000DBD817B5A3CB52C0B69AFD4035FF... | 0101000020110F000090B604C90DED5FC1CB2D85CC048F... | 2024-06-06T16:10:03Z | None | 169 | None | NaN | 14.0 | None | ... | 2071-12-27T05:00:00Z | 1971-12-27T05:00:00Z | 1.0 | 639.0 | NaN | None | None | None | 563042662 | POINT (-75.18187 39.99381) |
| 1 | 2 | 0101000020E61000006BFC9A49A4CB52C058DAA6D48BFE... | 0101000020110F0000E00B48C50EED5FC1019687FC488E... | 2024-06-06T16:08:52Z | None | 151 | None | NaN | 14.0 | None | ... | 2071-12-27T05:00:00Z | 2071-12-27T05:00:00Z | 1.0 | 674.0 | NaN | None | None | None | 563042663 | POINT (-75.18190 39.98864) |
| 2 | 3 | 0101000020E6100000111C0D6E98CF52C0573761511EF6... | 0101000020110F00000FE20CFFC5F35FC1FBFAC589F284... | 2024-06-06T16:12:21Z | None | 64 | None | NaN | 72.0 | None | ... | 2071-12-17T05:00:00Z | 2071-12-17T05:00:00Z | 1.0 | 6636.0 | 2400.0 | None | 1925 | None | 563042664 | POINT (-75.24368 39.92280) |
| 3 | 4 | 0101000020E61000009151A5AE7ACB52C0B624691A9BFE... | 0101000020110F00007D008E19C8EC5FC11B9681EA598E... | 2024-06-06T16:09:40Z | None | 151 | None | NaN | 16.0 | None | ... | 2071-12-16T05:00:00Z | 2071-12-16T05:00:00Z | 1.0 | 992.0 | NaN | None | None | None | 563042665 | POINT (-75.17936 39.98911) |
| 4 | 5 | 0101000020E61000005A33DE5D9ECA52C0F73C1F77F5FD... | 0101000020110F0000EB7628DF51EB5FC120B8F14FA28D... | 2024-06-06T16:08:58Z | None | 152 | None | NaN | 15.0 | None | ... | 2071-12-14T05:00:00Z | 2071-12-14T05:00:00Z | 1.0 | 1122.0 | NaN | None | None | None | 563042666 | POINT (-75.16592 39.98405) |
5 rows × 32 columns
#from above, we have cleared up the dataset to what we want, and can see there are 583thousand more dataimport datashader as ds
import datashader.transfer_functions as tf
from datashader.utils import lnglat_to_meters
import geopandas as gpd
from datashader.colors import Greys9, viridis, inferno
from colorcet import fire#here trying to show how did the data points spread out bounded by city limits
# Rasterize the ~583k property points with datashader and overlay the city boundary.
philadelphia_range_path = "final_data/city_limits_3857.geojson"
city_limits = gpd.read_file(philadelphia_range_path)
# NOTE(review): this round-trips 3857 -> 4326 -> 3857; the 4326 step is redundant here.
if city_limits.crs.to_string() != "EPSG:4326":
    city_limits = city_limits.to_crs(epsg=4326)
# Render everything in Web Mercator (EPSG:3857) so the points and boundary agree.
cleaned_properties_gdf = cleaned_properties_gdf.to_crs(epsg=3857)
city_limits = city_limits.to_crs(epsg=3857)
city_bounds = city_limits.total_bounds
x_range, y_range = (city_bounds[0], city_bounds[2]), (city_bounds[1], city_bounds[3])
# Datashader needs plain x/y columns rather than shapely geometries.
cleaned_properties_gdf["x"] = cleaned_properties_gdf.geometry.x
cleaned_properties_gdf["y"] = cleaned_properties_gdf.geometry.y
canvas = ds.Canvas(
    plot_width=800,
    plot_height=600,
    x_range=(city_limits.total_bounds[0], city_limits.total_bounds[2]),  # x extent
    y_range=(city_limits.total_bounds[1], city_limits.total_bounds[3]),  # y extent
)
# Count points per pixel, then log-shade so dense areas don't saturate.
agg = canvas.points(cleaned_properties_gdf, "x", "y", agg=ds.count())
image = tf.shade(agg, cmap=["lightblue", "blue", "darkblue"], how="log")
fig, ax = plt.subplots(figsize=(10, 8))
ax.imshow(
    image.to_pil(),
    extent=(
        city_limits.total_bounds[0],  # xmin
        city_limits.total_bounds[2],  # xmax
        city_limits.total_bounds[1],  # ymin
        city_limits.total_bounds[3],  # ymax
    ),
    origin="upper",
)
city_limits.plot(ax=ax, edgecolor="red", linewidth=1, facecolor="none")
ax.set_title("Philadelphia Housing Properties Distribution (Sampled)", fontsize=16)
plt.show()
# Now let's explore the housing data by month and by census tract.
import geopandas as gpd
from shapely.geometry import Point
import os

# NOTE(review): absolute, machine-specific path — not portable across machines.
os.chdir(r"C:\Users\44792\Desktop\essay help master\5500 MUSA Python\24fall-python-final-yu_qiushi_final_project")
print(f"Changed working directory to: {os.getcwd()}")
relative_path = "final/Philadelphia_ACS_2023.shp"
try:
    census_gdf = gpd.read_file(relative_path)
except FileNotFoundError:
    raise FileNotFoundError(f"The file at {relative_path} does not exist. Please check the path.")
# Work in Web Mercator so the tracts line up with the other layers.
if census_gdf.crs != "EPSG:3857":
    census_gdf = census_gdf.to_crs(epsg=3857)
print(f"Census Tract data loaded with {len(census_gdf)} rows.")
Census Tract data loaded with 384 rows.
census_gdf.tail()| GEOID | Total Popu | Median Age | White Alon | Black or A | Asian Alon | Hispanic o | Total Hous | Total Fami | Married-co | ... | Total Ho_1 | Occupied H | Vacant Hou | Owner-occu | Renter-occ | NAME | state | county | tract | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 379 | 42101035602 | 3661.0 | 42.1 | 2518.0 | 73.0 | 707.0 | 307.0 | 1252.0 | 999.0 | 834.0 | ... | 1284.0 | 1252.0 | 32.0 | 1022.0 | 230.0 | Census Tract 356.02, Philadelphia County, Penn... | 42 | 101 | 035602 | POLYGON ((-8355008.800 4881749.700, -8354858.9... |
| 380 | 42101035601 | 5272.0 | 55.4 | 4069.0 | 140.0 | 887.0 | 127.0 | 2155.0 | 1344.0 | 964.0 | ... | 2404.0 | 2155.0 | 249.0 | 1398.0 | 757.0 | Census Tract 356.01, Philadelphia County, Penn... | 42 | 101 | 035601 | POLYGON ((-8354256.940 4879308.850, -8354227.1... |
| 381 | 42101033102 | 4048.0 | 34.3 | 3107.0 | 299.0 | 31.0 | 438.0 | 1556.0 | 970.0 | 651.0 | ... | 1760.0 | 1556.0 | 204.0 | 935.0 | 621.0 | Census Tract 331.02, Philadelphia County, Penn... | 42 | 101 | 033102 | POLYGON ((-8352727.080 4872434.270, -8352711.8... |
| 382 | 42101034801 | 4600.0 | 44.6 | 3812.0 | 220.0 | 201.0 | 342.0 | 1969.0 | 1215.0 | 761.0 | ... | 2006.0 | 1969.0 | 37.0 | 922.0 | 1047.0 | Census Tract 348.01, Philadelphia County, Penn... | 42 | 101 | 034801 | POLYGON ((-8351724.540 4874706.750, -8351706.9... |
| 383 | 42101036201 | 5153.0 | 38.5 | 4195.0 | 308.0 | 0.0 | 419.0 | 1981.0 | 1257.0 | 839.0 | ... | 2153.0 | 1981.0 | 172.0 | 1327.0 | 654.0 | Census Tract 362.01, Philadelphia County, Penn... | 42 | 101 | 036201 | POLYGON ((-8348332.630 4877819.050, -8348033.1... |
5 rows × 23 columns
# Join the property data with the social (census) data; experiment with sale prices.
properties_with_tract = gpd.sjoin(
    cleaned_properties_gdf, census_gdf, how="inner", predicate="intersects"
)
# Bucket assessments by calendar month (pandas warns this drops timezone info).
properties_with_tract["year_month"] = pd.to_datetime(properties_with_tract["assessment_date"]).dt.to_period("M")
aggregated_data = properties_with_tract.groupby(["GEOID", "year_month"]).agg(
    property_count=("cartodb_id", "count"),
    avg_sale_price=("sale_price", "mean")
).reset_index()
# Re-attach tract polygons so the per-month aggregates can be mapped.
aggregated_data = aggregated_data.merge(
    census_gdf[["GEOID", "geometry"]], on="GEOID", how="left"
)
aggregated_gdf = gpd.GeoDataFrame(aggregated_data, geometry="geometry", crs="EPSG:3857")
properties_with_tract["year_month"] = pd.to_datetime(properties_with_tract["assessment_date"]).dt.to_period("M")
#and we can see, for same tract, not every month will have property register record
aggregated_gdf.tail()| GEOID | year_month | property_count | avg_sale_price | geometry | |
|---|---|---|---|---|---|
| 2386 | 42101980900 | 2024-06 | 338 | 2.272857e+06 | POLYGON ((-8378024.880 4848026.880, -8377966.4... |
| 2387 | 42101980900 | 2024-09 | 7 | 9.213524e+06 | POLYGON ((-8378024.880 4848026.880, -8377966.4... |
| 2388 | 42101980900 | 2024-11 | 1 | 5.655000e+05 | POLYGON ((-8378024.880 4848026.880, -8377966.4... |
| 2389 | 42101989100 | 2023-05 | 4 | 1.250000e+00 | POLYGON ((-8351467.390 4870423.710, -8351384.2... |
| 2390 | 42101989100 | 2024-06 | 45 | 7.283044e+05 | POLYGON ((-8351467.390 4870423.710, -8351384.2... |
# Map one example month's counts over the full tract background.
example_month = "2024-06"
plot_data = aggregated_gdf[aggregated_gdf["year_month"] == example_month]
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
census_gdf.plot(ax=ax, color="lightgrey", edgecolor="black")  # all tracts as backdrop
plot_data.plot(ax=ax, column="property_count", cmap="OrRd", legend=True)
ax.set_title(f"Housing Data Aggregation - {example_month}")
plt.show()
# Now we will investigate the pattern of house prices and their character by region.
import geopandas as gpd
from folium import GeoJson

# NOTE(review): this reloads the same shapefile already loaded above (cell re-run safety).
relative_path = "final/Philadelphia_ACS_2023.shp"
try:
    census_gdf = gpd.read_file(relative_path)
except FileNotFoundError:
    raise FileNotFoundError(f"The file at {relative_path} does not exist. Please check the path.")
if census_gdf.crs != "EPSG:3857":
    census_gdf = census_gdf.to_crs(epsg=3857)
print(f"Census Tract data loaded with {len(census_gdf)} rows.")Census Tract data loaded with 384 rows.
# Spatially join properties to tracts, aggregate per tract-month, and keep only
# GEOIDs present on both sides before re-attaching geometry.
properties_with_tract = gpd.sjoin(
    cleaned_properties_gdf, census_gdf, how="inner", predicate="intersects"
)
properties_with_tract["year_month"] = pd.to_datetime(properties_with_tract["assessment_date"]).dt.to_period("M")
aggregated_data = properties_with_tract.groupby(["GEOID", "year_month"]).agg(
    property_count=("cartodb_id", "count"),
    avg_sale_price=("sale_price", "mean")
).reset_index()
# Normalise GEOID dtype on both frames so the set intersection / merge keys match.
aggregated_data['GEOID'] = aggregated_data['GEOID'].astype(str)
census_gdf['GEOID'] = census_gdf['GEOID'].astype(str)
common_geoids = set(aggregated_data['GEOID']).intersection(set(census_gdf['GEOID']))
filtered_census_gdf = census_gdf[census_gdf['GEOID'].isin(common_geoids)]
filtered_aggregated_gdf = aggregated_data[aggregated_data['GEOID'].isin(common_geoids)]
final_aggregated_gdf = filtered_aggregated_gdf.merge(
    filtered_census_gdf[['GEOID', 'geometry']],
    on="GEOID",
    how="left"
)
final_aggregated_gdf = gpd.GeoDataFrame(final_aggregated_gdf, geometry="geometry", crs="EPSG:3857")
print(f"Final Aggregated Data Count: {len(final_aggregated_gdf)}")Final Aggregated Data Count: 2391
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_12344\1647549544.py:5: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.
properties_with_tract["year_month"] = pd.to_datetime(properties_with_tract["assessment_date"]).dt.to_period("M")
final_aggregated_gdf.tail()| GEOID | year_month | property_count | avg_sale_price | geometry | |
|---|---|---|---|---|---|
| 2386 | 42101980900 | 2024-06 | 338 | 2.272857e+06 | POLYGON ((-8378024.880 4848026.880, -8377966.4... |
| 2387 | 42101980900 | 2024-09 | 7 | 9.213524e+06 | POLYGON ((-8378024.880 4848026.880, -8377966.4... |
| 2388 | 42101980900 | 2024-11 | 1 | 5.655000e+05 | POLYGON ((-8378024.880 4848026.880, -8377966.4... |
| 2389 | 42101989100 | 2023-05 | 4 | 1.250000e+00 | POLYGON ((-8351467.390 4870423.710, -8351384.2... |
| 2390 | 42101989100 | 2024-06 | 45 | 7.283044e+05 | POLYGON ((-8351467.390 4870423.710, -8351384.2... |
# Here, I try to join the social and housing data together.
# NOTE(review): this first sjoin is immediately overwritten by the buffered join
# below, so its result is never used (dead work — could be removed).
properties_with_tract = gpd.sjoin(
    cleaned_properties_gdf, census_gdf, how="inner", predicate="intersects"
)
# Buffer the tract polygons by 10 (EPSG:3857 units, i.e. metres) so points that
# sit exactly on a boundary still match a tract.
buffered_census_gdf = census_gdf.copy()
buffered_census_gdf["geometry"] = buffered_census_gdf.geometry.buffer(10)
properties_with_tract = gpd.sjoin(
    cleaned_properties_gdf, buffered_census_gdf, how="inner", predicate="intersects"
)
# Switch the active geometry to the pre-parsed point column.
properties_with_tract = gpd.GeoDataFrame(
    properties_with_tract,
    geometry='parsed_geometry',
    crs=cleaned_properties_gdf.crs
)
print(len(properties_with_tract))
properties_with_tract["assessment_date"] = pd.to_datetime(properties_with_tract["assessment_date"], errors="coerce")
properties_with_tract["year_month"] = properties_with_tract["assessment_date"].dt.to_period("M")
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_12344\2193257280.py:22: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.
properties_with_tract["year_month"] = properties_with_tract["assessment_date"].dt.to_period("M")
# Rows whose assessment_date failed to parse get a sentinel month label.
if properties_with_tract["year_month"].isnull().sum() > 0:
    properties_with_tract["year_month"] = properties_with_tract["year_month"].fillna("Unknown")
properties_with_tract.tail()
#now datas are toegther!| cartodb_id | the_geom | the_geom_webmercator | assessment_date | basements | census_tract | central_air | fireplaces | frontage | fuel | ... | Total Ho_1 | Occupied H | Vacant Hou | Owner-occu | Renter-occ | NAME | state | county | tract | year_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 583657 | 584047 | 0101000020E6100000CA62F43784C452C0553088271300... | 0101000020110F0000DF8AED67F4E05FC1CB865FCEFA8F... | 2024-06-06 16:07:08+00:00 | D | 183 | N | 0.0 | 20.0 | None | ... | 1753.0 | 1539.0 | 214.0 | 1344.0 | 195.0 | Census Tract 183, Philadelphia County, Pennsyl... | 42 | 101 | 018300 | 2024-06 |
| 583658 | 584048 | 0101000020E61000002876388229C352C0DAF8E3121703... | 0101000020110F00002E29FC7BA7DE5FC15D1760C65293... | 2024-06-06 16:05:14+00:00 | None | 323 | N | 0.0 | 72.0 | None | ... | 1611.0 | 1419.0 | 192.0 | 787.0 | 632.0 | Census Tract 323, Philadelphia County, Pennsyl... | 42 | 101 | 032300 | 2024-06 |
| 583659 | 584049 | 0101000020E61000009638723466CA52C07637567AF002... | 0101000020110F0000C30DA979F2EA5FC11BC631F82793... | 2024-11-16 12:29:40+00:00 | None | 244 | None | NaN | NaN | None | ... | 1301.0 | 1011.0 | 290.0 | 460.0 | 551.0 | Census Tract 244, Philadelphia County, Pennsyl... | 42 | 101 | 024400 | 2024-11 |
| 583660 | 584050 | 0101000020E61000009638723466CA52C07637567AF002... | 0101000020110F0000C30DA979F2EA5FC11BC631F82793... | 2024-12-14 16:53:16+00:00 | None | 244 | None | NaN | NaN | None | ... | 1301.0 | 1011.0 | 290.0 | 460.0 | 551.0 | Census Tract 244, Philadelphia County, Pennsyl... | 42 | 101 | 024400 | 2024-12 |
| 583661 | 584051 | 0101000020E6100000C75E7D144DC552C04F40CC0FA801... | 0101000020110F0000F891E19649E25FC1DADDADC3BB91... | 2024-06-06 16:05:55+00:00 | None | 294 | None | NaN | 30.0 | None | ... | 1337.0 | 1127.0 | 210.0 | 496.0 | 631.0 | Census Tract 294, Philadelphia County, Pennsyl... | 42 | 101 | 029400 | 2024-06 |
5 rows × 58 columns
# Blanket-fill every remaining NaN with 0 (pandas emits a deprecation warning
# about the geometry column ignoring fillna's copy keyword).
properties_with_tract = properties_with_tract.fillna(0)
properties_with_tract = properties_with_tract.fillna(0)
from shapely.wkb import loads as wkb_loads

# Rebuild an active geometry column from the hex-WKB columns if it was lost.
if "geometry" not in properties_with_tract.columns:
    if "the_geom" in properties_with_tract.columns:
        print("Converting 'the_geom' to 'geometry'...")
        # the_geom is tagged lon/lat here — confirmed by the POINT(-75.x 39.x)
        # samples printed later in the notebook.
        properties_with_tract = gpd.GeoDataFrame(
            properties_with_tract,
            geometry=properties_with_tract["the_geom"].apply(wkb_loads),
            crs="EPSG:4326"
        )
    elif "the_geom_webmercator" in properties_with_tract.columns:
        print("Converting 'the_geom_webmercator' to 'geometry'...")
        properties_with_tract = gpd.GeoDataFrame(
            properties_with_tract,
            geometry=properties_with_tract["the_geom_webmercator"].apply(wkb_loads),
            crs="EPSG:3857"
        )
    else:
        raise ValueError("No valid geometry column ('geometry', 'the_geom', or 'the_geom_webmercator') found!")
print(final_aggregated_gdf.head())
print(final_aggregated_gdf.crs) GEOID year_month property_count avg_sale_price \
0 42101000100 2023-05 59 3.703884e+06
1 42101000100 2023-08 7 4.441428e+06
2 42101000100 2023-09 2 5.725000e+06
3 42101000100 2024-02 3 2.355000e+06
4 42101000100 2024-03 1 2.025000e+06
geometry
0 POLYGON ((-8365905.970 4858674.430, -8365885.3...
1 POLYGON ((-8365905.970 4858674.430, -8365885.3...
2 POLYGON ((-8365905.970 4858674.430, -8365885.3...
3 POLYGON ((-8365905.970 4858674.430, -8365885.3...
4 POLYGON ((-8365905.970 4858674.430, -8365885.3...
EPSG:3857
## Merely a test!!! (author note: got confused at this point)
print("Grouping housing data by GEOID...")
# Per-tract totals across all months (no month dimension here).
aggregated_df = properties_with_tract.groupby("GEOID", as_index=False).agg({
    "sale_price": "mean",
    "cartodb_id": "count",
})
aggregated_df.rename(
    columns={
        "sale_price": "avg_sale_price",
        "cartodb_id": "property_count"
    },
    inplace=True
)
# Merge back onto the census frame so every tract is kept, even without sales.
census_gdf["GEOID"] = census_gdf["GEOID"].astype(str)
aggregated_df["GEOID"] = aggregated_df["GEOID"].astype(str)
final_aggregated_data = census_gdf.merge(
    aggregated_df,
    on="GEOID",
    how="left"  # all tracts kept
)
# All data together now!
print("Final aggregated data shape:", final_aggregated_data.shape)
print("Final GeoDataFrame created successfully!")
print(final_aggregated_data.head())
print("Geometry type:", final_aggregated_data.geometry.geom_type.value_counts())Grouping housing data by GEOID...
Final aggregated data shape: (384, 25)
Final GeoDataFrame created successfully!
GEOID Total Popu Median Age White Alon Black or A Asian Alon \
0 42101009802 6190.0 33.5 580.0 5341.0 35.0
1 42101037500 3736.0 31.3 1672.0 1467.0 165.0
2 42101021900 1434.0 44.7 1307.0 50.0 24.0
3 42101011300 3344.0 28.5 62.0 3229.0 0.0
4 42101021800 5141.0 30.9 2822.0 1497.0 247.0
Hispanic o Total Hous Total Fami Married-co ... Vacant Hou \
0 149.0 2129.0 1467.0 576.0 ... 242.0
1 298.0 1224.0 733.0 547.0 ... 159.0
2 41.0 654.0 307.0 238.0 ... 97.0
3 53.0 1102.0 770.0 133.0 ... 290.0
4 178.0 2242.0 1073.0 887.0 ... 152.0
Owner-occu Renter-occ NAME \
0 1593.0 536.0 Census Tract 98.02, Philadelphia County, Penns...
1 722.0 502.0 Census Tract 375, Philadelphia County, Pennsyl...
2 551.0 103.0 Census Tract 219, Philadelphia County, Pennsyl...
3 621.0 481.0 Census Tract 113, Philadelphia County, Pennsyl...
4 597.0 1645.0 Census Tract 218, Philadelphia County, Pennsyl...
state county tract geometry \
0 42 101 009802 POLYGON ((-8379626.770 4862662.870, -8379606.2...
1 42 101 037500 POLYGON ((-8378518.360 4863306.140, -8378433.4...
2 42 101 021900 POLYGON ((-8377278.820 4872712.160, -8377218.9...
3 42 101 011300 POLYGON ((-8375619.820 4863198.200, -8375613.5...
4 42 101 021800 POLYGON ((-8375464.090 4874647.550, -8375257.5...
avg_sale_price property_count
0 154038.260785 2063
1 407074.872364 901
2 171537.744552 826
3 56054.240363 1323
4 195453.591292 712
[5 rows x 25 columns]
Geometry type: Polygon 384
Name: count, dtype: int64
print("Grouping geometry data by GEOID...")
# NOTE(review): .first() takes one arbitrary property POINT per tract, so this
# replaces the tract POLYGONs with points — the printed geom_type counts below
# confirm the result is all Point. Verify this is intended before mapping.
geometry_data = properties_with_tract.groupby("GEOID").geometry.first()
final_aggregated_data = gpd.GeoDataFrame(
    final_aggregated_data,
    geometry=geometry_data,
    crs=properties_with_tract.crs
)
print("Final GeoDataFrame created successfully!")
print("Sample geometries:")
print(properties_with_tract.geometry.head())
print("Geometry type:")
print(properties_with_tract.geometry.geom_type.value_counts())
Final GeoDataFrame created successfully!
Sample geometries:
0 POINT (-75.18187 39.99381)
1 POINT (-75.18190 39.98864)
2 POINT (-75.24368 39.92280)
3 POINT (-75.17936 39.98911)
4 POINT (-75.16592 39.98405)
Name: geometry, dtype: geometry
Geometry type:
Point 583942
Name: count, dtype: int64
geometry_data = geometry_data.reindex(final_aggregated_data["GEOID"]).reset_index(drop=True)
final_aggregated_data = gpd.GeoDataFrame(
final_aggregated_data,
geometry=geometry_data,
crs=properties_with_tract.crs
)#now lets make a merge to census tract data from different month, and find out price and feature, static data are fixed while I want to see changes across month for dynamic datafrom shapely.wkb import loads as wkb_loads
# Re-run the geometry-rebuild fallback (same logic as earlier, minus the prints).
# NOTE(review): here 'the_geom' is tagged EPSG:3857, but the earlier rebuild
# tagged the same column EPSG:4326 — one of the two is wrong; confirm which CRS
# the stored WKB actually uses.
if "geometry" not in properties_with_tract.columns:
    if "the_geom" in properties_with_tract.columns:
        properties_with_tract = gpd.GeoDataFrame(
            properties_with_tract,
            geometry=properties_with_tract["the_geom"].apply(wkb_loads),
            crs="EPSG:3857"
        )
    elif "the_geom_webmercator" in properties_with_tract.columns:
        properties_with_tract = gpd.GeoDataFrame(
            properties_with_tract,
            geometry=properties_with_tract["the_geom_webmercator"].apply(wkb_loads),
            crs="EPSG:3857"
        )
    else:
        raise ValueError("No valid geometry column found!")
# One representative point per tract, realigned to final_aggregated_data's rows.
geometry_data = properties_with_tract.groupby("GEOID").geometry.first()
print("Geometry data after grouping:")
print(geometry_data.head())
print("Null geometries count:", geometry_data.isnull().sum())
geometry_data = geometry_data.reindex(final_aggregated_data["GEOID"]).reset_index(drop=True)
final_aggregated_data = gpd.GeoDataFrame(
    final_aggregated_data,
    geometry=geometry_data,
    crs=properties_with_tract.crs
)
print("Final aggregated data geometries:")
print(final_aggregated_data.geometry.head())
print("Geometry types:")
print(final_aggregated_data.geometry.geom_type.value_counts())
GEOID
42101000100 POINT (-75.14808 39.95194)
42101000200 POINT (-75.16098 39.95747)
42101000300 POINT (-75.16542 39.95504)
42101000401 POINT (-75.17577 39.95263)
42101000402 POINT (-75.16494 39.95148)
Name: geometry, dtype: geometry
Null geometries count: 0
Final aggregated data geometries:
0 POINT (-75.26348 39.97478)
1 POINT (-75.25241 39.98796)
2 POINT (-75.24063 40.04991)
3 POINT (-75.23341 39.97987)
4 POINT (-75.23413 40.05904)
Name: geometry, dtype: geometry
Geometry types:
Point 384
Name: count, dtype: int64
# Build the final per-tract table: dynamic housing aggregates + static ACS columns.
properties_with_tract["property_count"] = 1
aggregated_data = (
    properties_with_tract
    .groupby("GEOID", as_index=False)  # no month dimension any more (removed; was by month before)
    .agg(
        total_property_count=("property_count", "sum"),
        avg_rooms=("number_of_rooms", "mean"),
        avg_bathrooms=("number_of_bathrooms", "mean"),
        avg_bedrooms=("number_of_bedrooms", "mean"),
        avg_sale_price=("sale_price", "mean"),
        avg_market_value=("market_value", "mean")
    )
)
# Express prices in millions (rounded to 2 dp) for readability.
aggregated_data["avg_sale_price"] = (aggregated_data["avg_sale_price"] / 1_000_000).round(2)
aggregated_data["avg_market_value"] = (aggregated_data["avg_market_value"] / 1_000_000).round(2)
# Static ACS columns: one value per tract, so deduplicate on GEOID.
static_columns = [
    "GEOID", "Total Popu", "Median Age", "White Alon", "Black or A",
    "Asian Alon", "Hispanic o", "Total Hous", "Total Fami",
    "Married-co", "Poverty St", "Below Pove", "Median Hou",
    "Occupied H", "Vacant Hou", "Owner-occu", "Renter-occ"
]
static_data = (
    properties_with_tract[static_columns]
    .drop_duplicates(subset="GEOID")
)
final_aggregated_data = aggregated_data.merge(
    static_data, on="GEOID", how="left"
)
census_gdf["GEOID"] = census_gdf["GEOID"].astype(str)
final_aggregated_data["GEOID"] = final_aggregated_data["GEOID"].astype(str)
# NOTE(review): census_gdf already carries these ACS columns, so this second
# merge produces the duplicated _x/_y columns that are dropped further below.
final_aggregated_data = census_gdf.merge(
    final_aggregated_data,
    on="GEOID",
    how="left"
)
final_aggregated_data = gpd.GeoDataFrame(
    final_aggregated_data,
    geometry=final_aggregated_data.geometry,
    crs=census_gdf.crs
)
print("Final aggregated data geometry type:")
print(final_aggregated_data.geometry.geom_type.value_counts())
print(final_aggregated_data.columns)
#here I encountered some duplicated column names, so I drop them later, u may ask why I not going back? I dont really know what is the source of the problem, so maybe just not touch itFinal aggregated data geometry type:
Polygon 384
Name: count, dtype: int64
Index(['GEOID', 'Total Popu_x', 'Median Age_x', 'White Alon_x', 'Black or A_x',
'Asian Alon_x', 'Hispanic o_x', 'Total Hous_x', 'Total Fami_x',
'Married-co_x', 'Poverty St_x', 'Below Pove_x', 'Median Hou_x',
'Total Ho_1', 'Occupied H_x', 'Vacant Hou_x', 'Owner-occu_x',
'Renter-occ_x', 'NAME', 'state', 'county', 'tract', 'geometry',
'total_property_count', 'avg_rooms', 'avg_bathrooms', 'avg_bedrooms',
'avg_sale_price', 'avg_market_value', 'Total Popu_y', 'Median Age_y',
'White Alon_y', 'Black or A_y', 'Asian Alon_y', 'Hispanic o_y',
'Total Hous_y', 'Total Fami_y', 'Married-co_y', 'Poverty St_y',
'Below Pove_y', 'Median Hou_y', 'Occupied H_y', 'Vacant Hou_y',
'Owner-occu_y', 'Renter-occ_y'],
dtype='object')
# Drop the duplicated '_x' columns produced by the double merge above, keeping
# the '_y' copies that arrived with the aggregated data.
cols_to_drop = [col for col in final_aggregated_data.columns if col.endswith('_x')]
final_aggregated_data.drop(columns=cols_to_drop, inplace=True)
print(final_aggregated_data.columns)
'total_property_count', 'avg_rooms', 'avg_bathrooms', 'avg_bedrooms',
'avg_sale_price', 'avg_market_value', 'Total Popu_y', 'Median Age_y',
'White Alon_y', 'Black or A_y', 'Asian Alon_y', 'Hispanic o_y',
'Total Hous_y', 'Total Fami_y', 'Married-co_y', 'Poverty St_y',
'Below Pove_y', 'Median Hou_y', 'Occupied H_y', 'Vacant Hou_y',
'Owner-occu_y', 'Renter-occ_y'],
dtype='object')
print(final_aggregated_data.tail()) GEOID Total Ho_1 \
379 42101035602 1284.0
380 42101035601 2404.0
381 42101033102 1760.0
382 42101034801 2006.0
383 42101036201 2153.0
NAME state county tract \
379 Census Tract 356.02, Philadelphia County, Penn... 42 101 035602
380 Census Tract 356.01, Philadelphia County, Penn... 42 101 035601
381 Census Tract 331.02, Philadelphia County, Penn... 42 101 033102
382 Census Tract 348.01, Philadelphia County, Penn... 42 101 034801
383 Census Tract 362.01, Philadelphia County, Penn... 42 101 036201
geometry total_property_count \
379 POLYGON ((-8355008.800 4881749.700, -8354858.9... 1126
380 POLYGON ((-8354256.940 4879308.850, -8354227.1... 2064
381 POLYGON ((-8352727.080 4872434.270, -8352711.8... 1410
382 POLYGON ((-8351724.540 4874706.750, -8351706.9... 875
383 POLYGON ((-8348332.630 4877819.050, -8348033.1... 1563
avg_rooms avg_bathrooms ... Total Hous_y Total Fami_y Married-co_y \
379 0.000000 0.267318 ... 1252.0 999.0 834.0
380 0.008721 0.706880 ... 2155.0 1344.0 964.0
381 0.111348 0.735461 ... 1556.0 970.0 651.0
382 0.272000 0.657143 ... 1969.0 1215.0 761.0
383 0.000000 0.852847 ... 1981.0 1257.0 839.0
Poverty St_y Below Pove_y Median Hou_y Occupied H_y Vacant Hou_y \
379 999.0 0.0 81912.0 1252.0 32.0
380 1344.0 172.0 52673.0 2155.0 249.0
381 970.0 119.0 42692.0 1556.0 204.0
382 1215.0 34.0 57463.0 1969.0 37.0
383 1257.0 57.0 74360.0 1981.0 172.0
Owner-occu_y Renter-occ_y
379 1022.0 230.0
380 1398.0 757.0
381 935.0 621.0
382 922.0 1047.0
383 1327.0 654.0
[5 rows x 29 columns]
def fix_missing_rooms(gdf):
    """Impute missing ``avg_rooms`` values in place and return the frame.

    Where both ``avg_bedrooms`` and ``avg_bathrooms`` are known, estimate
    rooms as bedrooms + bathrooms + 0.5; otherwise fall back to 1.
    """
    missing = gdf["avg_rooms"].isna()
    have_parts = gdf["avg_bedrooms"].notna() & gdf["avg_bathrooms"].notna()
    estimate = gdf["avg_bedrooms"] + gdf["avg_bathrooms"] + 0.5
    gdf.loc[missing & have_parts, "avg_rooms"] = estimate
    gdf.loc[missing & ~have_parts, "avg_rooms"] = 1
    return gdf
final_aggregated_data = fix_missing_rooms(final_aggregated_data)

# Now create the panel selector.
# Columns the user can choose to map / chart.
dynamic_cols = [
    'total_property_count', 'avg_rooms', 'avg_bathrooms',
    'avg_bedrooms', 'avg_sale_price', 'avg_market_value'
]
# Also let viewers see the demographic context (shown in the map tooltip).
tooltip_cols = [
    'Total Popu_y', 'Median Age_y','White Alon_y', 'Black or A_y', 'Asian Alon_y',
    'Hispanic o_y','Total Hous_y', 'Total Fami_y', 'Married-co_y','Poverty St_y',
    'Below Pove_y', 'Median Hou_y', 'Occupied H_y', 'Vacant Hou_y',
    'Owner-occu_y', 'Renter-occ_y'
]
# Folium works in lon/lat, so switch to EPSG:4326 for display.
final_aggregated_data = final_aggregated_data.to_crs(epsg=4326)
from shapely.geometry import mapping
import folium
import geopandas as gpd
import panel as pn
import altair as alt
import mapclassify
import numpy as np

gdf = final_aggregated_data
###############################
# Function/Legend!
###############################
def get_natural_breaks_bins(data_series, k=8, lower_q=0.01, upper_q=0.99):
    """Return k+1 Natural-Breaks class edges for a numeric series.

    Values beyond the [lower_q, upper_q] quantiles are clipped before
    classification so extreme outliers don't dominate the breaks, but the
    returned edges still span the true (unclipped) min..max of the data.
    """
    values = data_series.dropna()
    lo = values.quantile(lower_q)
    hi = values.quantile(upper_q)
    classifier = mapclassify.NaturalBreaks(np.clip(values, lo, hi), k=k)
    edges = [values.min()] + classifier.bins.tolist()
    edges[-1] = values.max()  # re-anchor the last edge at the real maximum
    return edges
def create_legend_html(bins, legend_name):
    """Build a fixed-position HTML legend listing each class range.

    bins: ascending class edges (length k+1); legend_name: heading text.
    Returns the legend as an HTML string, ready to inject into a folium map.
    """
    # One "lo ~ hi" label per consecutive pair of edges.
    labels = []
    for i in range(len(bins)-1):
        start_val = bins[i]
        end_val = bins[i+1]
        labels.append(f"{start_val:.2f} ~ {end_val:.2f}")
    legend_items = "".join([f"<li>{lbl}</li>" for lbl in labels])
    legend_html = f"""
    <div style="
        position: absolute;
        z-index:9999;
        bottom: 80px;
        right: 10px;
        background-color: white;
        border:2px solid grey;
        border-radius:5px;
        padding: 10px;
    ">
    <h4 style="margin-top:0">{legend_name}</h4>
    <ul style="list-style:none; padding-left:0; margin:0;">
    {legend_items}
    </ul>
    </div>
    """
    return legend_html
###############################
# Map!
###############################
def create_folium_map(selected_col):
    """Render a folium choropleth of ``selected_col`` per census tract.

    Relies on the module-level ``gdf`` (tracts in EPSG:4326) and
    ``tooltip_cols``. Folium's built-in legend is hidden with CSS and replaced
    by a custom HTML legend built from the same Natural-Breaks bins.
    Returns the folium.Map.
    """
    # Center the map on the midpoint of the layer's bounds. The previous
    # version averaged GeoSeries.centroid in a geographic CRS, which geopandas
    # explicitly warns gives incorrect results; the bounds midpoint avoids
    # both the warning and a projection round-trip.
    minx, miny, maxx, maxy = gdf.total_bounds
    center_lat = (miny + maxy) / 2
    center_lon = (minx + maxx) / 2
    m = folium.Map(
        location=[center_lat, center_lon],
        zoom_start=11,
        tiles='CartoDB Positron',
        control_scale=True
    )
    geojson_data = gdf.to_json()
    # Same bins feed both the choropleth and the custom legend.
    bins = get_natural_breaks_bins(gdf[selected_col], k=8, lower_q=0.01, upper_q=0.99)
    choropleth = folium.Choropleth(
        geo_data=geojson_data,
        data=gdf,
        columns=['GEOID', selected_col],
        key_on='feature.properties.GEOID',
        bins=bins,
        fill_color='YlOrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
        highlight=True,
        legend_name=f'{selected_col} (Natural Breaks)',
    )
    choropleth.add_to(m)
    # Hide folium's default legend; the custom HTML legend replaces it below.
    style_statement = """
    <style>
    .legend {
        display: none !important;
    }
    </style>
    """
    m.get_root().header.add_child(folium.Element(style_statement))
    # Tooltip: tract GEOID plus its demographic context columns.
    folium.GeoJsonTooltip(
        fields=['GEOID'] + tooltip_cols,
        aliases=['GEOID'] + tooltip_cols,
        localize=True
    ).add_to(choropleth.geojson)
    legend_html = create_legend_html(bins, f"{selected_col} (Natural Breaks)")
    m.get_root().html.add_child(folium.Element(legend_html))
    return m
###############################
# Histo!
###############################
def create_histogram(selected_col):
    """Altair histogram of ``selected_col`` across all tracts in ``gdf``."""
    source = gdf.copy()
    x_enc = alt.X(f"{selected_col}:Q", bin=alt.Bin(maxbins=30), title=f"{selected_col}")
    y_enc = alt.Y('count()', title='Count')
    chart = alt.Chart(source).mark_bar().encode(x=x_enc, y=y_enc)
    return chart.properties(
        width=400,
        height=250,
        title=f"Distribution of {selected_col} (Natural Breaks used on map)"
    )
###############################
# 4. Panel
###############################
# Widget driving both the map and the histogram (defaults to the first metric).
select_widget = pn.widgets.Select(
    name="Select a column to visualize",
    options=dynamic_cols,
    value=dynamic_cols[0]
)
@pn.depends(select_widget.param.value)
def update_view(selected_col):
    """Rebuild the map + histogram row whenever the selector changes."""
    map_pane = pn.pane.HTML(
        create_folium_map(selected_col)._repr_html_(),
        width=700, height=500
    )
    chart_pane = pn.pane.Vega(create_histogram(selected_col), width=400, height=300)
    return pn.Row(map_pane, chart_pane)
# Assemble the dashboard: title, hint text, selector, and the reactive view.
dashboard = pn.Column(
    "# Census Tract Interactive Dashboard",
    "select indicator to continue:",
    select_widget,
    update_view
)
dashboard.servable()
center_lat = gdf.geometry.centroid.y.mean()
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_12344\4061076871.py:60: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.
center_lon = gdf.geometry.centroid.x.mean()
C:\Users\44792\miniforge3\envs\musa-550-fall-2023\lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
warnings.warn(
###Now, Question 1, OSL crime_gdf = crime_gdf.to_crs(epsg=3857)
census_gdf = census_gdf.to_crs(epsg=3857)
crime_by_tract = gpd.sjoin(
crime_gdf,
census_gdf,
how='inner',
predicate='intersects'
)
print("crime_by_tract columns:", crime_by_tract.columns)
print("Number of records after sjoin:", len(crime_by_tract))crime_by_tract columns: Index(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'objectid', 'dc_dist',
'psa', 'dispatch_date_time', 'dispatch_date', 'dispatch_time', 'hour',
'dc_key', 'location_block', 'ucr_general', 'text_general_code',
'point_x', 'point_y', 'geometry', 'year_month', 'crime_category',
'index_right', 'GEOID', 'Total Popu', 'Median Age', 'White Alon',
'Black or A', 'Asian Alon', 'Hispanic o', 'Total Hous', 'Total Fami',
'Married-co', 'Poverty St', 'Below Pove', 'Median Hou', 'Total Ho_1',
'Occupied H', 'Vacant Hou', 'Owner-occu', 'Renter-occ', 'NAME', 'state',
'county', 'tract'],
dtype='object')
Number of records after sjoin: 462145
# Total crimes per tract, plus a tract-by-category breakdown table.
crime_count_by_tract = crime_by_tract.groupby("GEOID").size().reset_index(name="total_crime_count")
print(crime_count_by_tract.head())
crime_count_by_tract_type = crime_by_tract.groupby(["GEOID","crime_category"]).size().unstack(fill_value=0)
0 42101000100 2047
1 42101000200 1192
2 42101000300 1946
3 42101000401 810
4 42101000402 2991
# Attach the crime totals to the tract frame; tracts with no crimes get 0.
census_gdf = census_gdf.merge(crime_count_by_tract, on="GEOID", how="left")
census_gdf["total_crime_count"] = census_gdf["total_crime_count"].fillna(0)
census_gdf.head()| GEOID | Total Popu | Median Age | White Alon | Black or A | Asian Alon | Hispanic o | Total Hous | Total Fami | Married-co | ... | Occupied H | Vacant Hou | Owner-occu | Renter-occ | NAME | state | county | tract | geometry | total_crime_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 42101009802 | 6190.0 | 33.5 | 580.0 | 5341.0 | 35.0 | 149.0 | 2129.0 | 1467.0 | 576.0 | ... | 2129.0 | 242.0 | 1593.0 | 536.0 | Census Tract 98.02, Philadelphia County, Penns... | 42 | 101 | 009802 | POLYGON ((-8379626.770 4862662.870, -8379606.2... | 1031 |
| 1 | 42101037500 | 3736.0 | 31.3 | 1672.0 | 1467.0 | 165.0 | 298.0 | 1224.0 | 733.0 | 547.0 | ... | 1224.0 | 159.0 | 722.0 | 502.0 | Census Tract 375, Philadelphia County, Pennsyl... | 42 | 101 | 037500 | POLYGON ((-8378518.360 4863306.140, -8378433.4... | 486 |
| 2 | 42101021900 | 1434.0 | 44.7 | 1307.0 | 50.0 | 24.0 | 41.0 | 654.0 | 307.0 | 238.0 | ... | 654.0 | 97.0 | 551.0 | 103.0 | Census Tract 219, Philadelphia County, Pennsyl... | 42 | 101 | 021900 | POLYGON ((-8377278.820 4872712.160, -8377218.9... | 167 |
| 3 | 42101011300 | 3344.0 | 28.5 | 62.0 | 3229.0 | 0.0 | 53.0 | 1102.0 | 770.0 | 133.0 | ... | 1102.0 | 290.0 | 621.0 | 481.0 | Census Tract 113, Philadelphia County, Pennsyl... | 42 | 101 | 011300 | POLYGON ((-8375619.820 4863198.200, -8375613.5... | 975 |
| 4 | 42101021800 | 5141.0 | 30.9 | 2822.0 | 1497.0 | 247.0 | 178.0 | 2242.0 | 1073.0 | 887.0 | ... | 2242.0 | 152.0 | 597.0 | 1645.0 | Census Tract 218, Philadelphia County, Pennsyl... | 42 | 101 | 021800 | POLYGON ((-8375464.090 4874647.550, -8375257.5... | 490 |
5 rows × 24 columns
# Short aliases for the ACS race/population columns.
census_gdf["white_pop"] = census_gdf["White Alon"]
census_gdf["black_pop"] = census_gdf["Black or A"]
census_gdf["hisp_pop"] = census_gdf["Hispanic o"]
census_gdf["total_pop"] = census_gdf["Total Popu"]
census_gdf["other_pop"] = census_gdf["total_pop"] - (
    census_gdf["white_pop"] + census_gdf["black_pop"] + census_gdf["hisp_pop"]
)
# Clip out negative remainders (the three listed groups can sum above total_pop).
census_gdf["other_pop"] = census_gdf["other_pop"].clip(lower=0)
# Race ratios. NOTE(review): a tract with total_pop == 0 would yield inf/NaN here.
census_gdf["white_ratio"] = census_gdf["white_pop"] / census_gdf["total_pop"]
census_gdf["black_ratio"] = census_gdf["black_pop"] / census_gdf["total_pop"]
census_gdf["hisp_ratio"] = census_gdf["hisp_pop"] / census_gdf["total_pop"]
census_gdf["other_ratio"] = census_gdf["other_pop"] / census_gdf["total_pop"]
def calc_diversity(row):
    """Diversity index for one tract: 1 minus the sum of squared group shares.

    Equals 0 when one group holds the whole population and approaches
    0.75 when the four shares are equal.
    """
    shares = (
        row["white_ratio"],
        row["black_ratio"],
        row["hisp_ratio"],
        row["other_ratio"],
    )
    return 1 - sum(share ** 2 for share in shares)
census_gdf["diversity_index"] = census_gdf.apply(calc_diversity, axis=1)
census_gdf.head()| GEOID | Total Popu | Median Age | White Alon | Black or A | Asian Alon | Hispanic o | Total Hous | Total Fami | Married-co | ... | white_pop | black_pop | hisp_pop | total_pop | other_pop | white_ratio | black_ratio | hisp_ratio | other_ratio | diversity_index | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 42101009802 | 6190.0 | 33.5 | 580.0 | 5341.0 | 35.0 | 149.0 | 2129.0 | 1467.0 | 576.0 | ... | 580.0 | 5341.0 | 149.0 | 6190.0 | 120.0 | 0.093700 | 0.862843 | 0.024071 | 0.019386 | 0.245767 |
| 1 | 42101037500 | 3736.0 | 31.3 | 1672.0 | 1467.0 | 165.0 | 298.0 | 1224.0 | 733.0 | 547.0 | ... | 1672.0 | 1467.0 | 298.0 | 3736.0 | 299.0 | 0.447537 | 0.392666 | 0.079764 | 0.080032 | 0.632756 |
| 2 | 42101021900 | 1434.0 | 44.7 | 1307.0 | 50.0 | 24.0 | 41.0 | 654.0 | 307.0 | 238.0 | ... | 1307.0 | 50.0 | 41.0 | 1434.0 | 36.0 | 0.911437 | 0.034868 | 0.028591 | 0.025105 | 0.166620 |
| 3 | 42101011300 | 3344.0 | 28.5 | 62.0 | 3229.0 | 0.0 | 53.0 | 1102.0 | 770.0 | 133.0 | ... | 62.0 | 3229.0 | 53.0 | 3344.0 | 0.0 | 0.018541 | 0.965610 | 0.015849 | 0.000000 | 0.067002 |
| 4 | 42101021800 | 5141.0 | 30.9 | 2822.0 | 1497.0 | 247.0 | 178.0 | 2242.0 | 1073.0 | 887.0 | ... | 2822.0 | 1497.0 | 178.0 | 5141.0 | 644.0 | 0.548920 | 0.291188 | 0.034624 | 0.125267 | 0.597005 |
5 rows × 34 columns
house_cols = ["GEOID", "avg_sale_price", "avg_rooms", "total_property_count"]
census_gdf = census_gdf.merge(final_aggregated_data[house_cols], on="GEOID", how="left")
census_gdf[["avg_sale_price","avg_rooms","total_property_count"]] = \
census_gdf[["avg_sale_price","avg_rooms","total_property_count"]].fillna(0)census_gdf.head()
print(census_gdf.columns)Index(['GEOID', 'Total Popu', 'Median Age', 'White Alon', 'Black or A',
'Asian Alon', 'Hispanic o', 'Total Hous', 'Total Fami', 'Married-co',
'Poverty St', 'Below Pove', 'Median Hou', 'Total Ho_1', 'Occupied H',
'Vacant Hou', 'Owner-occu', 'Renter-occ', 'NAME', 'state', 'county',
'tract', 'geometry', 'total_crime_count', 'white_pop', 'black_pop',
'hisp_pop', 'total_pop', 'other_pop', 'white_ratio', 'black_ratio',
'hisp_ratio', 'other_ratio', 'diversity_index', 'avg_sale_price',
'avg_rooms', 'total_property_count'],
dtype='object')
# Keep only the columns we expect to use downstream (not every one of
# them will necessarily appear in the later models).
columns_to_keep = [
    "GEOID", 'Total Popu', 'Median Age', 'Poverty St', 'Below Pove', 'Married-co',
    "white_ratio", "black_ratio", "hisp_ratio", "other_ratio", "diversity_index",
    "total_crime_count", "avg_sale_price", "avg_rooms", "total_property_count",
    "geometry",
]
# Guard against columns that may be missing from this particular frame.
available_cols = [name for name in columns_to_keep if name in census_gdf.columns]
census_gdf = census_gdf.loc[:, available_cols]
census_gdf.head()| GEOID | Total Popu | Median Age | Poverty St | Below Pove | Married-co | white_ratio | black_ratio | hisp_ratio | other_ratio | diversity_index | total_crime_count | avg_sale_price | avg_rooms | total_property_count | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 42101009802 | 6190.0 | 33.5 | 1467.0 | 282.0 | 576.0 | 0.093700 | 0.862843 | 0.024071 | 0.019386 | 0.245767 | 1031 | 0.15 | 0.002908 | 2063 | POLYGON ((-8379626.770 4862662.870, -8379606.2... |
| 1 | 42101037500 | 3736.0 | 31.3 | 733.0 | 55.0 | 547.0 | 0.447537 | 0.392666 | 0.079764 | 0.080032 | 0.632756 | 486 | 0.41 | 0.022198 | 901 | POLYGON ((-8378518.360 4863306.140, -8378433.4... |
| 2 | 42101021900 | 1434.0 | 44.7 | 307.0 | 0.0 | 238.0 | 0.911437 | 0.034868 | 0.028591 | 0.025105 | 0.166620 | 167 | 0.17 | 0.176755 | 826 | POLYGON ((-8377278.820 4872712.160, -8377218.9... |
| 3 | 42101011300 | 3344.0 | 28.5 | 770.0 | 333.0 | 133.0 | 0.018541 | 0.965610 | 0.015849 | 0.000000 | 0.067002 | 975 | 0.06 | 0.000000 | 1323 | POLYGON ((-8375619.820 4863198.200, -8375613.5... |
| 4 | 42101021800 | 5141.0 | 30.9 | 1073.0 | 92.0 | 887.0 | 0.548920 | 0.291188 | 0.034624 | 0.125267 | 0.597005 | 490 | 0.20 | 0.037921 | 712 | POLYGON ((-8375464.090 4874647.550, -8375257.5... |
print(final_aggregated_data.columns)Index(['GEOID', 'Total Ho_1', 'NAME', 'state', 'county', 'tract', 'geometry',
'total_property_count', 'avg_rooms', 'avg_bathrooms', 'avg_bedrooms',
'avg_sale_price', 'avg_market_value', 'Total Popu_y', 'Median Age_y',
'White Alon_y', 'Black or A_y', 'Asian Alon_y', 'Hispanic o_y',
'Total Hous_y', 'Total Fami_y', 'Married-co_y', 'Poverty St_y',
'Below Pove_y', 'Median Hou_y', 'Occupied H_y', 'Vacant Hou_y',
'Owner-occu_y', 'Renter-occ_y'],
dtype='object')
census_gdf = census_gdf.merge(
final_aggregated_data[["GEOID","Total Popu_y"]],
on="GEOID",
how="left"
)census_gdf = census_gdf.merge(
final_aggregated_data[["GEOID",'Median Age_y','Poverty St_y','Below Pove_y','Vacant Hou_y']],
on="GEOID",
how="left"
)census_gdf.head()| GEOID | Total Popu | Median Age | Poverty St | Below Pove | Married-co | white_ratio | black_ratio | hisp_ratio | other_ratio | ... | total_crime_count | avg_sale_price | avg_rooms | total_property_count | geometry | Total Popu_y | Median Age_y | Poverty St_y | Below Pove_y | Vacant Hou_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 42101009802 | 6190.0 | 33.5 | 1467.0 | 282.0 | 576.0 | 0.093700 | 0.862843 | 0.024071 | 0.019386 | ... | 1031 | 0.15 | 0.002908 | 2063 | POLYGON ((-8379626.770 4862662.870, -8379606.2... | 6190.0 | 33.5 | 1467.0 | 282.0 | 242.0 |
| 1 | 42101037500 | 3736.0 | 31.3 | 733.0 | 55.0 | 547.0 | 0.447537 | 0.392666 | 0.079764 | 0.080032 | ... | 486 | 0.41 | 0.022198 | 901 | POLYGON ((-8378518.360 4863306.140, -8378433.4... | 3736.0 | 31.3 | 733.0 | 55.0 | 159.0 |
| 2 | 42101021900 | 1434.0 | 44.7 | 307.0 | 0.0 | 238.0 | 0.911437 | 0.034868 | 0.028591 | 0.025105 | ... | 167 | 0.17 | 0.176755 | 826 | POLYGON ((-8377278.820 4872712.160, -8377218.9... | 1434.0 | 44.7 | 307.0 | 0.0 | 97.0 |
| 3 | 42101011300 | 3344.0 | 28.5 | 770.0 | 333.0 | 133.0 | 0.018541 | 0.965610 | 0.015849 | 0.000000 | ... | 975 | 0.06 | 0.000000 | 1323 | POLYGON ((-8375619.820 4863198.200, -8375613.5... | 3344.0 | 28.5 | 770.0 | 333.0 | 290.0 |
| 4 | 42101021800 | 5141.0 | 30.9 | 1073.0 | 92.0 | 887.0 | 0.548920 | 0.291188 | 0.034624 | 0.125267 | ... | 490 | 0.20 | 0.037921 | 712 | POLYGON ((-8375464.090 4874647.550, -8375257.5... | 5141.0 | 30.9 | 1073.0 | 92.0 | 152.0 |
5 rows × 21 columns
census_gdf["crime_rate"] = (census_gdf["total_crime_count"] / census_gdf["Total Popu_y"]) * 1000
census_gdf["crime_rate"] = census_gdf["crime_rate"].fillna(0).replace([np.inf, -np.inf], 0)census_gdf = census_gdf[ (census_gdf["Total Popu_y"] > 0) & (census_gdf["crime_rate"].notnull()) ]
# Dependent V
y = census_gdf["crime_rate"]
# independent V
X = census_gdf[["black_ratio", "white_ratio", "avg_sale_price"]]import statsmodels.api as sm
# I manually add the intercept here; it is constant across all observations, so it should have no influence on the other coefficients
X = sm.add_constant(X)
X.head()| const | black_ratio | white_ratio | avg_sale_price | |
|---|---|---|---|---|
| 0 | 1.0 | 0.862843 | 0.093700 | 0.15 |
| 1 | 1.0 | 0.392666 | 0.447537 | 0.41 |
| 2 | 1.0 | 0.034868 | 0.911437 | 0.17 |
| 3 | 1.0 | 0.965610 | 0.018541 | 0.06 |
| 4 | 1.0 | 0.291188 | 0.548920 | 0.20 |
# missing='drop' excludes any row with NaN in y or X from the fit.
model = sm.OLS(y, X, missing='drop')
results = model.fit()
print(results.summary()) OLS Regression Results
==============================================================================
Dep. Variable: crime_rate R-squared: 0.003
Model: OLS Adj. R-squared: -0.005
Method: Least Squares F-statistic: 0.3550
Date: Thu, 26 Dec 2024 Prob (F-statistic): 0.786
Time: 11:59:35 Log-Likelihood: -3156.2
No. Observations: 376 AIC: 6320.
Df Residuals: 372 BIC: 6336.
Df Model: 3
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
const 574.9899 238.005 2.416 0.016 106.987 1042.993
black_ratio -267.0188 295.893 -0.902 0.367 -848.852 314.814
white_ratio -236.9274 330.994 -0.716 0.475 -887.782 413.927
avg_sale_price 3.9940 9.458 0.422 0.673 -14.603 22.591
==============================================================================
Omnibus: 810.129 Durbin-Watson: 1.978
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1125693.675
Skew: 15.633 Prob(JB): 0.00
Kurtosis: 269.224 Cond. No. 51.8
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF per regressor; the 'const' column that sm.add_constant() appended is
# excluded because VIF is only meaningful for the non-intercept predictors.
X_no_const = X.loc[:, X.columns != 'const']
vif_data = pd.DataFrame()
vif_data["feature"] = X_no_const.columns
vif_data["VIF"] = [variance_inflation_factor(X_no_const.values, i) for i in range(X_no_const.shape[1])]
print(vif_data) feature VIF
0 black_ratio 1.064220
1 white_ratio 1.072523
2 avg_sale_price 1.011484
import matplotlib.pyplot as plt
# Diagnostic scatter: fitted vs. observed crime rates for the baseline model.
y_pred = results.predict(X)
plt.scatter(y, y_pred, alpha=0.5)
plt.xlabel("Actual Crime Rate")
plt.ylabel("Predicted Crime Rate")
plt.title("OLS Fit: Actual vs. Predicted")
plt.show()
# Keep the residuals of the baseline model alongside the tract data.
census_gdf["resid"] = results.resid
# Let's do some improvement: enrich the feature set with more ACS variables.
# Missing ACS attributes are imputed with 0 before building the new features.
census_gdf["Median Age_y"] = census_gdf["Median Age_y"].fillna(0)
census_gdf["Poverty St_y"] = census_gdf["Poverty St_y"].fillna(0)
census_gdf["Below Pove_y"] = census_gdf["Below Pove_y"].fillna(0)
census_gdf["Vacant Hou_y"] = census_gdf["Vacant Hou_y"].fillna(0)
census_gdf["below_poverty_ratio"] = (
census_gdf["Below Pove_y"] / census_gdf["Total Popu_y"]
).replace([np.inf, -np.inf], 0).fillna(0)feature_cols = [
"black_ratio",
"white_ratio",
"avg_sale_price",
"Median Age_y",
"below_poverty_ratio",
"Vacant Hou_y"
]import statsmodels.api as sm
# Same dependent variable as before.
y = census_gdf["crime_rate"]
# New, expanded set of independent variables.
X = census_gdf[feature_cols].copy()
X = X.fillna(0)
# log1p compresses the right tail of sale prices (and is defined at 0).
X["avg_sale_price"] = np.log1p(X["avg_sale_price"])
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Re-check multicollinearity for the expanded feature set; the intercept
# column is dropped first because VIF only applies to real predictors.
predictors = X.drop(columns=["const"], errors="ignore")
vif_data = pd.DataFrame({
    "feature": predictors.columns,
    "VIF": [
        variance_inflation_factor(predictors.values, j)
        for j in range(predictors.shape[1])
    ],
})
print(vif_data) OLS Regression Results
==============================================================================
Dep. Variable: crime_rate R-squared: 0.050
Model: OLS Adj. R-squared: 0.035
Method: Least Squares F-statistic: 3.237
Date: Thu, 26 Dec 2024 Prob (F-statistic): 0.00411
Time: 11:59:35 Log-Likelihood: -3147.1
No. Observations: 376 AIC: 6308.
Df Residuals: 369 BIC: 6336.
Df Model: 6
Covariance Type: nonrobust
=======================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------
const -582.6870 429.256 -1.357 0.175 -1426.783 261.409
black_ratio 189.6411 326.899 0.580 0.562 -453.178 832.460
white_ratio 667.2105 427.683 1.560 0.120 -173.791 1508.212
avg_sale_price 281.4062 149.743 1.879 0.061 -13.051 575.863
Median Age_y 8.4286 8.189 1.029 0.304 -7.674 24.531
below_poverty_ratio 1.151e+04 2816.750 4.086 0.000 5971.367 1.7e+04
Vacant Hou_y -0.6743 0.421 -1.600 0.110 -1.503 0.154
==============================================================================
Omnibus: 781.262 Durbin-Watson: 1.971
Prob(Omnibus): 0.000 Jarque-Bera (JB): 896403.493
Skew: 14.465 Prob(JB): 0.00
Kurtosis: 240.445 Cond. No. 1.41e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.41e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
feature VIF
0 black_ratio 8.066012
1 white_ratio 8.710505
2 avg_sale_price 1.488647
3 Median Age_y 19.326683
4 below_poverty_ratio 3.727617
5 Vacant Hou_y 4.233158
#What a pity — basically we can't yet say that crime is related to sale prices or race, but it is strongly related to the poverty rate; let's try deleting the high-VIF factor to see if there is any improvementfeature_cols = [
"black_ratio",
"white_ratio",
"avg_sale_price",
"Median Age_y", # VIF≈19
"below_poverty_ratio",
"Vacant Hou_y"
]
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Work on a copy so the VIF experiments don't mutate census_gdf.
df = census_gdf.copy()
X = df[feature_cols].fillna(0).copy()
X = sm.add_constant(X)
# Exclude the intercept column before computing VIFs.
X_no_const = X.drop(columns=["const"], errors="ignore")
vif_data = pd.DataFrame()
vif_data["feature"] = X_no_const.columns
vif_data["VIF"] = [
    variance_inflation_factor(X_no_const.values, i) for i in range(X_no_const.shape[1])
]
print("Initial VIF:\n", vif_data)Initial VIF:
feature VIF
0 black_ratio 8.124570
1 white_ratio 8.672783
2 avg_sale_price 1.040092
3 Median Age_y 19.298617
4 below_poverty_ratio 3.655698
5 Vacant Hou_y 4.147828
# Drop 'Median Age_y' (VIF ≈ 19 in the previous check) and re-measure
# collinearity among the remaining predictors.
feature_cols_reduced = [
    "black_ratio",
    "white_ratio",
    "avg_sale_price",
    "below_poverty_ratio",
    "Vacant Hou_y",
]
X2 = sm.add_constant(df[feature_cols_reduced].fillna(0).copy())
X2_no_const = X2.drop(columns=["const"], errors="ignore")
vif_data2 = pd.DataFrame({
    "feature": X2_no_const.columns,
    "VIF": [
        variance_inflation_factor(X2_no_const.values, k)
        for k in range(X2_no_const.shape[1])
    ],
})
print("VIF after dropping Median Age_y:\n", vif_data2)VIF after dropping Median Age_y:
feature VIF
0 black_ratio 3.544943
1 white_ratio 1.481318
2 avg_sale_price 1.034620
3 below_poverty_ratio 3.007876
4 Vacant Hou_y 4.123210
# Final model: reduced feature set, with sale price log-transformed as before.
X_final = df[feature_cols_reduced].fillna(0).copy()
X_final["avg_sale_price"] = np.log1p(X_final["avg_sale_price"])
X_final = sm.add_constant(X_final)
y = df["crime_rate"].fillna(0)
model = sm.OLS(y, X_final, missing='drop')
results = model.fit()
print(results.summary())
# Confirm the VIFs after removing the high-collinearity variable.
X_no_const_final = X_final.drop(columns=["const"], errors="ignore")
vif_data_final = pd.DataFrame({
    "feature": X_no_const_final.columns,
    "VIF": [
        variance_inflation_factor(X_no_const_final.values, i)
        for i in range(X_no_const_final.shape[1])
    ]
})
print("Final VIF after removing high-collinearity vars:\n", vif_data_final) OLS Regression Results
==============================================================================
Dep. Variable: crime_rate R-squared: 0.047
Model: OLS Adj. R-squared: 0.034
Method: Least Squares F-statistic: 3.672
Date: Thu, 26 Dec 2024 Prob (F-statistic): 0.00296
Time: 11:59:36 Log-Likelihood: -3147.6
No. Observations: 376 AIC: 6307.
Df Residuals: 370 BIC: 6331.
Df Model: 5
Covariance Type: nonrobust
=======================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------
const -315.5345 341.930 -0.923 0.357 -987.904 356.835
black_ratio 267.0040 318.167 0.839 0.402 -358.639 892.647
white_ratio 760.4843 418.006 1.819 0.070 -61.481 1582.450
avg_sale_price 268.6425 149.241 1.800 0.073 -24.824 562.109
below_poverty_ratio 1.119e+04 2800.248 3.998 0.000 5688.368 1.67e+04
Vacant Hou_y -0.7413 0.416 -1.781 0.076 -1.560 0.077
==============================================================================
Omnibus: 785.338 Durbin-Watson: 1.975
Prob(Omnibus): 0.000 Jarque-Bera (JB): 930284.806
Skew: 14.623 Prob(JB): 0.00
Kurtosis: 244.919 Cond. No. 1.40e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.4e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
Final VIF after removing high-collinearity vars:
feature VIF
0 black_ratio 3.537142
1 white_ratio 1.717726
2 avg_sale_price 1.478665
3 below_poverty_ratio 3.057857
4 Vacant Hou_y 4.204546
#It seems housing price is far from the most important factor influencing crime.#Q2: here we test how each crime category relates to sale and market price.import geopandas as gpd
import statsmodels.api as sm

# Load the 2023 ACS census-tract polygons for Philadelphia and normalize
# the CRS to WGS84 lon/lat.
shp_path = "final/Philadelphia_ACS_2023.shp"
try:
    tracts_gdf = gpd.read_file(shp_path)
    print(f"Loaded {len(tracts_gdf)} Census Tracts from {shp_path}")
except FileNotFoundError as err:
    # Chain the original error so the real cause stays in the traceback.
    # NOTE(review): geopandas backends may raise DriverError rather than
    # FileNotFoundError for a missing path — confirm which one applies here.
    raise FileNotFoundError(f"The file at {shp_path} does not exist. Please check the path.") from err
if tracts_gdf.crs != "EPSG:4326":
    tracts_gdf = tracts_gdf.to_crs(epsg=4326)
tracts_gdf.head()Loaded 384 Census Tracts from final/Philadelphia_ACS_2023.shp
| GEOID | Total Popu | Median Age | White Alon | Black or A | Asian Alon | Hispanic o | Total Hous | Total Fami | Married-co | ... | Total Ho_1 | Occupied H | Vacant Hou | Owner-occu | Renter-occ | NAME | state | county | tract | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 42101009802 | 6190.0 | 33.5 | 580.0 | 5341.0 | 35.0 | 149.0 | 2129.0 | 1467.0 | 576.0 | ... | 2371.0 | 2129.0 | 242.0 | 1593.0 | 536.0 | Census Tract 98.02, Philadelphia County, Penns... | 42 | 101 | 009802 | POLYGON ((-75.27547 39.97743, -75.27528 39.977... |
| 1 | 42101037500 | 3736.0 | 31.3 | 1672.0 | 1467.0 | 165.0 | 298.0 | 1224.0 | 733.0 | 547.0 | ... | 1383.0 | 1224.0 | 159.0 | 722.0 | 502.0 | Census Tract 375, Philadelphia County, Pennsyl... | 42 | 101 | 037500 | POLYGON ((-75.26551 39.98186, -75.26475 39.982... |
| 2 | 42101021900 | 1434.0 | 44.7 | 1307.0 | 50.0 | 24.0 | 41.0 | 654.0 | 307.0 | 238.0 | ... | 751.0 | 654.0 | 97.0 | 551.0 | 103.0 | Census Tract 219, Philadelphia County, Pennsyl... | 42 | 101 | 021900 | POLYGON ((-75.25438 40.04657, -75.25384 40.046... |
| 3 | 42101011300 | 3344.0 | 28.5 | 62.0 | 3229.0 | 0.0 | 53.0 | 1102.0 | 770.0 | 133.0 | ... | 1392.0 | 1102.0 | 290.0 | 621.0 | 481.0 | Census Tract 113, Philadelphia County, Pennsyl... | 42 | 101 | 011300 | POLYGON ((-75.23947 39.98111, -75.23942 39.981... |
| 4 | 42101021800 | 5141.0 | 30.9 | 2822.0 | 1497.0 | 247.0 | 178.0 | 2242.0 | 1073.0 | 887.0 | ... | 2394.0 | 2242.0 | 152.0 | 597.0 | 1645.0 | Census Tract 218, Philadelphia County, Pennsyl... | 42 | 101 | 021800 | POLYGON ((-75.23807 40.05988, -75.23622 40.061... |
5 rows × 23 columns
print("Number of crime records:", len(crime_gdf))
crime_gdf.head()
# Project both layers to Web Mercator so the spatial join compares
# geometries in the same CRS.
crime_gdf = crime_gdf.to_crs(epsg=3857)
tracts_gdf = tracts_gdf.to_crs(epsg=3857)
# 1) sjoin: attach the containing tract's attributes to each crime point.
crime_by_tract = gpd.sjoin(
    crime_gdf,
    tracts_gdf,
    how="inner",
    predicate="intersects"
).copy()
print("Records after sjoin:", len(crime_by_tract))
crime_by_tract.head()
print(crime_by_tract.columns)Number of crime records: 462960
Records after sjoin: 462145
Index(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'objectid', 'dc_dist',
'psa', 'dispatch_date_time', 'dispatch_date', 'dispatch_time', 'hour',
'dc_key', 'location_block', 'ucr_general', 'text_general_code',
'point_x', 'point_y', 'geometry', 'year_month', 'crime_category',
'index_right', 'GEOID', 'Total Popu', 'Median Age', 'White Alon',
'Black or A', 'Asian Alon', 'Hispanic o', 'Total Hous', 'Total Fami',
'Married-co', 'Poverty St', 'Below Pove', 'Median Hou', 'Total Ho_1',
'Occupied H', 'Vacant Hou', 'Owner-occu', 'Renter-occ', 'NAME', 'state',
'county', 'tract'],
dtype='object')
# Incident counts per (tract, crime category) pair, in long format.
crime_counts = (crime_by_tract
    .groupby(["GEOID", "crime_category"])
    .size()
    .reset_index(name="count")
)
crime_counts.head()| GEOID | crime_category | count | |
|---|---|---|---|
| 0 | 42101000100 | Drug/Alcohol Crime | 16 |
| 1 | 42101000100 | Miscellaneous | 419 |
| 2 | 42101000100 | Other | 274 |
| 3 | 42101000100 | Property Crime | 1192 |
| 4 | 42101000100 | Sexual Offenses | 8 |
# Long -> wide: one row per tract, one column per crime category;
# tracts with no incidents of a category get 0.
crime_counts_wide = crime_counts.pivot_table(
    index="GEOID",
    columns="crime_category",
    values="count",
    fill_value=0
).reset_index()
# Drop the leftover "crime_category" axis label from the pivot.
crime_counts_wide.columns.name = None
crime_counts_wide.head()| GEOID | Drug/Alcohol Crime | Miscellaneous | Other | Property Crime | Sexual Offenses | Violent Crime | |
|---|---|---|---|---|---|---|---|
| 0 | 42101000100 | 16.0 | 419.0 | 274.0 | 1192.0 | 8.0 | 138.0 |
| 1 | 42101000200 | 12.0 | 165.0 | 174.0 | 711.0 | 24.0 | 106.0 |
| 2 | 42101000300 | 11.0 | 413.0 | 433.0 | 942.0 | 19.0 | 128.0 |
| 3 | 42101000401 | 3.0 | 83.0 | 94.0 | 579.0 | 3.0 | 48.0 |
| 4 | 42101000402 | 40.0 | 601.0 | 509.0 | 1497.0 | 46.0 | 298.0 |
print("final_aggregated_data columns:\n", final_aggregated_data.columns)
print("Length of final_aggregated_data:", len(final_aggregated_data))
# Attach housing/ACS aggregates to the per-tract crime counts.
merged_df = crime_counts_wide.merge(
    final_aggregated_data,
    on="GEOID",
    how="left"
)
# Fill NaNs only in non-geometry columns: fillna(0) on the geometry column
# is ignored by GeoPandas anyway and triggers a GeometryArray deprecation
# warning, so we keep it out of the fill.
data_cols = merged_df.columns.difference(["geometry"])
merged_df[data_cols] = merged_df[data_cols].fillna(0)
print("Merged shape:", merged_df.shape)
merged_df.head()final_aggregated_data columns:
Index(['GEOID', 'Total Ho_1', 'NAME', 'state', 'county', 'tract', 'geometry',
'total_property_count', 'avg_rooms', 'avg_bathrooms', 'avg_bedrooms',
'avg_sale_price', 'avg_market_value', 'Total Popu_y', 'Median Age_y',
'White Alon_y', 'Black or A_y', 'Asian Alon_y', 'Hispanic o_y',
'Total Hous_y', 'Total Fami_y', 'Married-co_y', 'Poverty St_y',
'Below Pove_y', 'Median Hou_y', 'Occupied H_y', 'Vacant Hou_y',
'Owner-occu_y', 'Renter-occ_y'],
dtype='object')
Length of final_aggregated_data: 384
Merged shape: (384, 35)
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_12344\2015751945.py:10: DeprecationWarning: ExtensionArray.fillna added a 'copy' keyword in pandas 2.1.0. In a future version, ExtensionArray subclasses will need to implement this keyword or an exception will be raised. In the interim, the keyword is ignored by GeometryArray.
merged_df = merged_df.fillna(0)
| GEOID | Drug/Alcohol Crime | Miscellaneous | Other | Property Crime | Sexual Offenses | Violent Crime | Total Ho_1 | NAME | state | ... | Total Hous_y | Total Fami_y | Married-co_y | Poverty St_y | Below Pove_y | Median Hou_y | Occupied H_y | Vacant Hou_y | Owner-occu_y | Renter-occ_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 42101000100 | 16.0 | 419.0 | 274.0 | 1192.0 | 8.0 | 138.0 | 2827.0 | Census Tract 1, Philadelphia County, Pennsylvania | 42 | ... | 2489.0 | 753.0 | 621.0 | 753.0 | 14.0 | 103585.0 | 2489.0 | 338.0 | 901.0 | 1588.0 |
| 1 | 42101000200 | 12.0 | 165.0 | 174.0 | 711.0 | 24.0 | 106.0 | 1219.0 | Census Tract 2, Philadelphia County, Pennsylvania | 42 | ... | 1082.0 | 503.0 | 402.0 | 503.0 | 55.0 | 49871.0 | 1082.0 | 137.0 | 523.0 | 559.0 |
| 2 | 42101000300 | 11.0 | 413.0 | 433.0 | 942.0 | 19.0 | 128.0 | 2063.0 | Census Tract 3, Philadelphia County, Pennsylvania | 42 | ... | 1828.0 | 588.0 | 516.0 | 588.0 | 16.0 | 86296.0 | 1828.0 | 235.0 | 441.0 | 1387.0 |
| 3 | 42101000401 | 3.0 | 83.0 | 94.0 | 579.0 | 3.0 | 48.0 | 1973.0 | Census Tract 4.01, Philadelphia County, Pennsy... | 42 | ... | 1618.0 | 437.0 | 414.0 | 437.0 | 50.0 | 62986.0 | 1618.0 | 355.0 | 491.0 | 1127.0 |
| 4 | 42101000402 | 40.0 | 601.0 | 509.0 | 1497.0 | 46.0 | 298.0 | 3448.0 | Census Tract 4.02, Philadelphia County, Pennsy... | 42 | ... | 2918.0 | 672.0 | 601.0 | 672.0 | 0.0 | 78947.0 | 2918.0 | 530.0 | 1679.0 | 1239.0 |
5 rows × 35 columns
def run_regression_crime_type(df, crime_col, housing_cols):
    """Fit and print an OLS of one crime-count column on housing features.

    df: merged dataframe
    crime_col: e.g. "Violent Crime", "Property Crime", ...
    housing_cols: list of columns to use as X
    Returns the fitted statsmodels results object.
    """
    design = sm.add_constant(df[housing_cols].copy(), has_constant='add')
    target = df[crime_col].values
    fitted = sm.OLS(target, design, missing='drop').fit()
    print(f"Regression on {crime_col} ~ {housing_cols}")
    print(fitted.summary())
    print("-"*80)
    return fitted
housing_features = ["avg_sale_price", "avg_rooms", "total_property_count"]crime_types = [
"Violent Crime",
"Property Crime",
"Drug/Alcohol Crime",
"Sexual Offenses",
"Miscellaneous"
]
results_dict = {}
for ctype in crime_types:
if ctype in merged_df.columns:
results = run_regression_crime_type(
merged_df,
ctype,
housing_features
)
results_dict[ctype] = results
else:
print(f"Column {ctype} not in merged_df, skipping...")Regression on Violent Crime ~ ['avg_sale_price', 'avg_rooms', 'total_property_count']
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.205
Model: OLS Adj. R-squared: 0.199
Method: Least Squares F-statistic: 32.64
Date: Thu, 26 Dec 2024 Prob (F-statistic): 8.60e-19
Time: 11:59:40 Log-Likelihood: -2209.9
No. Observations: 384 AIC: 4428.
Df Residuals: 380 BIC: 4444.
Df Model: 3
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 43.7517 9.639 4.539 0.000 24.799 62.704
avg_sale_price -0.2682 0.647 -0.415 0.679 -1.540 1.003
avg_rooms -55.5946 12.919 -4.303 0.000 -80.997 -30.192
total_property_count 0.0463 0.005 8.456 0.000 0.036 0.057
==============================================================================
Omnibus: 120.879 Durbin-Watson: 0.782
Prob(Omnibus): 0.000 Jarque-Bera (JB): 399.855
Skew: 1.411 Prob(JB): 1.49e-87
Kurtosis: 7.127 Cond. No. 5.77e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
--------------------------------------------------------------------------------
Regression on Property Crime ~ ['avg_sale_price', 'avg_rooms', 'total_property_count']
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.084
Model: OLS Adj. R-squared: 0.077
Method: Least Squares F-statistic: 11.67
Date: Thu, 26 Dec 2024 Prob (F-statistic): 2.48e-07
Time: 11:59:40 Log-Likelihood: -2852.2
No. Observations: 384 AIC: 5712.
Df Residuals: 380 BIC: 5728.
Df Model: 3
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 481.3326 51.328 9.378 0.000 380.410 582.255
avg_sale_price 0.2273 3.444 0.066 0.947 -6.544 6.999
avg_rooms -200.7619 68.794 -2.918 0.004 -336.027 -65.497
total_property_count 0.1431 0.029 4.909 0.000 0.086 0.200
==============================================================================
Omnibus: 254.406 Durbin-Watson: 1.620
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2874.925
Skew: 2.688 Prob(JB): 0.00
Kurtosis: 15.280 Cond. No. 5.77e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
--------------------------------------------------------------------------------
Regression on Drug/Alcohol Crime ~ ['avg_sale_price', 'avg_rooms', 'total_property_count']
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.040
Model: OLS Adj. R-squared: 0.033
Method: Least Squares F-statistic: 5.304
Date: Thu, 26 Dec 2024 Prob (F-statistic): 0.00136
Time: 11:59:40 Log-Likelihood: -2257.8
No. Observations: 384 AIC: 4524.
Df Residuals: 380 BIC: 4539.
Df Model: 3
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const -1.2540 10.919 -0.115 0.909 -22.724 20.216
avg_sale_price 0.0067 0.733 0.009 0.993 -1.434 1.447
avg_rooms -20.9467 14.635 -1.431 0.153 -49.723 7.829
total_property_count 0.0222 0.006 3.584 0.000 0.010 0.034
==============================================================================
Omnibus: 615.491 Durbin-Watson: 0.679
Prob(Omnibus): 0.000 Jarque-Bera (JB): 142748.835
Skew: 8.928 Prob(JB): 0.00
Kurtosis: 95.752 Cond. No. 5.77e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
--------------------------------------------------------------------------------
Regression on Sexual Offenses ~ ['avg_sale_price', 'avg_rooms', 'total_property_count']
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.025
Model: OLS Adj. R-squared: 0.017
Method: Least Squares F-statistic: 3.202
Date: Thu, 26 Dec 2024 Prob (F-statistic): 0.0233
Time: 11:59:40 Log-Likelihood: -1866.6
No. Observations: 384 AIC: 3741.
Df Residuals: 380 BIC: 3757.
Df Model: 3
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 8.0772 3.942 2.049 0.041 0.326 15.829
avg_sale_price -0.0336 0.265 -0.127 0.899 -0.554 0.487
avg_rooms -10.2522 5.284 -1.940 0.053 -20.642 0.137
total_property_count 0.0050 0.002 2.249 0.025 0.001 0.009
==============================================================================
Omnibus: 806.172 Durbin-Watson: 1.691
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1063695.011
Skew: 14.805 Prob(JB): 0.00
Kurtosis: 259.133 Cond. No. 5.77e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
--------------------------------------------------------------------------------
Regression on Miscellaneous ~ ['avg_sale_price', 'avg_rooms', 'total_property_count']
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.167
Model: OLS Adj. R-squared: 0.160
Method: Least Squares F-statistic: 25.37
Date: Thu, 26 Dec 2024 Prob (F-statistic): 5.58e-15
Time: 11:59:40 Log-Likelihood: -2421.9
No. Observations: 384 AIC: 4852.
Df Residuals: 380 BIC: 4868.
Df Model: 3
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 90.9089 16.741 5.430 0.000 57.992 123.826
avg_sale_price -0.4549 1.123 -0.405 0.686 -2.664 1.754
avg_rooms -75.0119 22.438 -3.343 0.001 -119.130 -30.894
total_property_count 0.0729 0.010 7.670 0.000 0.054 0.092
==============================================================================
Omnibus: 183.690 Durbin-Watson: 1.059
Prob(Omnibus): 0.000 Jarque-Bera (JB): 965.740
Skew: 2.029 Prob(JB): 1.96e-210
Kurtosis: 9.625 Cond. No. 5.77e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
--------------------------------------------------------------------------------
# Scatter each crime category against average sale price, one panel per
# category, so the per-type relationships can be compared side by side.
crime_types = ["Violent Crime","Property Crime","Drug/Alcohol Crime","Sexual Offenses","Miscellaneous"]
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(20, 4), sharey=False)
for ax, ctype in zip(axes, crime_types):
    ax.scatter(merged_df["avg_sale_price"], merged_df[ctype], alpha=0.5)
    ax.set_xlabel("Avg Sale Price")
    ax.set_ylabel(ctype)
    ax.set_title(f"{ctype} vs. Avg Sale Price")
plt.tight_layout()
plt.show()

# Overlay all five categories on one color-coded scatter for a combined view.
plt.figure(figsize=(8, 6))
colors = ["red","blue","green","orange","purple"]
crime_types = ["Violent Crime","Property Crime","Drug/Alcohol Crime","Sexual Offenses","Miscellaneous"]
for ctype, color in zip(crime_types, colors):
    plt.scatter(merged_df["avg_sale_price"], merged_df[ctype], alpha=0.5, color=color, label=ctype)
plt.xlabel("Avg Sale Price")
plt.ylabel("Crime Count")
plt.title("Crime Count vs. Avg Sale Price (All Types in One Plot)")
plt.legend()
plt.show()
#Q3 #lets have a peek of the data structure and data content
# Quick structural inspection of crime_gdf (column list, then first rows)
# before deriving the time-series features below.
# NOTE(review): the cell's printed output is fused onto the tail of the
# last code line — a notebook-flattening artifact, not part of the code.
print("Columns in crime_gdf:", crime_gdf.columns.tolist())
print(crime_gdf.head())Columns in crime_gdf: ['cartodb_id', 'the_geom', 'the_geom_webmercator', 'objectid', 'dc_dist', 'psa', 'dispatch_date_time', 'dispatch_date', 'dispatch_time', 'hour', 'dc_key', 'location_block', 'ucr_general', 'text_general_code', 'point_x', 'point_y', 'geometry', 'year_month', 'crime_category']
cartodb_id the_geom \
0 2 0101000020E6100000A51C8299A5C752C006342AD3DCFF...
1 4 0101000020E6100000F9245E3B64CC52C0B7195D940FF6...
2 7 0101000020E6100000118A52E7F6C052C0CFF41263190C...
3 123 0101000020E6100000E1F9FB7B5FC552C0159C0B6D4A02...
4 126 0101000020E6100000D1CCD5875CCA52C014B723FFC005...
the_geom_webmercator objectid dc_dist psa \
0 0101000020110F0000F80DE2A145E65FC1E5EC7592BE8F... 114 25 3
1 0101000020110F00000426B7CE54EE5FC1C5E06D37E284... 116 01 1
2 0101000020110F00006728CED7EBDA5FC169DB64F8519D... 119 08 2
3 0101000020110F00009D28D4D968E25FC13CD5C3D06F92... 96 15 1
4 0101000020110F00002F28E30AE2EA5FC10090A3314796... 99 14 1
dispatch_date_time dispatch_date dispatch_time hour dc_key \
0 2023-03-11T17:12:00Z 2023-03-11 12:12:00 12.0 202325014482
1 2023-03-11T18:31:00Z 2023-03-11 13:31:00 13.0 202301004597
2 2023-03-11T22:13:00Z 2023-03-11 17:13:00 17.0 202308008412
3 2023-03-11T12:42:00Z 2023-03-11 07:42:00 7.0 202315017366
4 2023-03-12T00:54:00Z 2023-03-11 19:54:00 19.0 202314012625
location_block ucr_general text_general_code point_x \
0 3300 BLOCK HARTVILLE ST 300 Robbery No Firearm -75.119482
1 2400 BLOCK S 28TH ST 600 Theft from Vehicle -75.193618
2 9800 BLOCK Roosevelt Blvd 600 Thefts -75.015070
3 4700 BLOCK GRISCOM ST 600 Thefts -75.083953
4 5500 BLOCK BLOYD ST 300 Robbery No Firearm -75.161898
point_y geometry year_month crime_category
0 39.998927 POINT (-8362262.529 4865786.288) 2023-03 Violent Crime
1 39.922350 POINT (-8370515.230 4854664.866) 2023-03 Property Crime
2 40.094525 POINT (-8350639.372 4879687.881) 2023-03 Property Crime
3 40.017896 POINT (-8358307.404 4868543.262) 2023-03 Property Crime
4 40.044952 POINT (-8366984.170 4872476.776) 2023-03 Violent Crime
# Verify that the five expected crime categories all occur in crime_gdf.
unique_cats = crime_gdf["crime_category"].unique()
print("Unique crime categories in crime_gdf:", unique_cats)
# Set difference: expected categories that never appear in the data.
# (The printed output shows an extra 'Other' category as well, which this
# check deliberately ignores.)
missing_cats = set(["Violent Crime","Property Crime","Drug/Alcohol Crime","Sexual Offenses","Miscellaneous"]) - set(unique_cats)
if missing_cats:
print("Warning: These categories not found in data:", missing_cats)
else:
print("All 5 categories are present!")Unique crime categories in crime_gdf: ['Violent Crime' 'Property Crime' 'Sexual Offenses' 'Other'
'Drug/Alcohol Crime' 'Miscellaneous']
All 5 categories are present!
# convert type
# Derive calendar features from dispatch_date for monthly/weekly analysis.
crime_gdf["year_month"] = crime_gdf["dispatch_date"].dt.to_period("M").astype(str)
crime_gdf["day_of_week"] = crime_gdf["dispatch_date"].dt.dayofweek
# hour (which does not exist)
# NOTE(review): dispatch_date carries no time-of-day component, so
# .dt.hour is always 0 — confirmed by the printed "Unique hour_of_day: [0]"
# below. The actual hour lives in the "hour" / "dispatch_date_time"
# columns; use one of those if hourly analysis is ever needed.
crime_gdf["hour_of_day"] = crime_gdf["dispatch_date"].dt.hour
# days
# Sanity checks: value ranges, total row count, and missing-value counts.
print("Unique day_of_week:", sorted(crime_gdf["day_of_week"].unique()))
print("Unique hour_of_day:", sorted(crime_gdf["hour_of_day"].unique()))
print("Total records in crime_gdf:", len(crime_gdf))
missing_date = crime_gdf["dispatch_date"].isna().sum()
print("Missing dispatch_date rows:", missing_date)
missing_cat = crime_gdf["crime_category"].isna().sum()
print("Missing crime_category rows:", missing_cat)
# First look at the month x category count table.
group_month_cat = crime_gdf.groupby(["year_month","crime_category"]).size().reset_index(name="count")
print("First few lines of month-cat pivot:")
print(group_month_cat.head(10).to_string())
# Month
unique_months = group_month_cat["year_month"].unique()
print(f"Found {len(unique_months)} distinct year_month values.")Unique day_of_week: [0, 1, 2, 3, 4, 5, 6]
Unique hour_of_day: [0]
Total records in crime_gdf: 462960
Missing dispatch_date rows: 0
Missing crime_category rows: 0
First few lines of month-cat pivot:
year_month crime_category count
0 2022-01 Drug/Alcohol Crime 257
1 2022-01 Miscellaneous 1542
2 2022-01 Other 1752
3 2022-01 Property Crime 5608
4 2022-01 Sexual Offenses 123
5 2022-01 Violent Crime 1234
6 2022-02 Drug/Alcohol Crime 335
7 2022-02 Miscellaneous 1748
8 2022-02 Other 1840
9 2022-02 Property Crime 5295
Found 36 distinct year_month values.
# Day-of-week x category counts.
group_dow_cat = crime_gdf.groupby(["day_of_week","crime_category"]).size().reset_index(name="count")
print(group_dow_cat.head(14))
# Hour x category counts. hour_of_day is always 0 here (see earlier
# output), so this collapses to one row per category.
group_hour_cat = crime_gdf.groupby(["hour_of_day","crime_category"]).size().reset_index(name="count")
print(group_hour_cat.head(10))
# NOTE(review): the two lines below repeat the aggregation above verbatim —
# they are redundant and safe to delete.
group_hour_cat = crime_gdf.groupby(["hour_of_day","crime_category"]).size().reset_index(name="count")
print(group_hour_cat.head(10)) day_of_week crime_category count
0 0 Drug/Alcohol Crime 1184
1 0 Miscellaneous 10568
2 0 Other 11818
3 0 Property Crime 41281
4 0 Sexual Offenses 806
5 0 Violent Crime 6115
6 1 Drug/Alcohol Crime 1830
7 1 Miscellaneous 12314
8 1 Other 11492
9 1 Property Crime 39193
10 1 Sexual Offenses 853
11 1 Violent Crime 5773
12 2 Drug/Alcohol Crime 2020
13 2 Miscellaneous 12228
hour_of_day crime_category count
0 0 Drug/Alcohol Crime 11239
1 0 Miscellaneous 72893
2 0 Other 76261
3 0 Property Crime 256726
4 0 Sexual Offenses 5416
5 0 Violent Crime 40425
hour_of_day crime_category count
0 0 Drug/Alcohol Crime 11239
1 0 Miscellaneous 72893
2 0 Other 76261
3 0 Property Crime 256726
4 0 Sexual Offenses 5416
5 0 Violent Crime 40425
# NOTE(review): notebook flattening fused cell boundaries here — the
# trailing "monthly_counts = ..." on the comment line below belongs to the
# next cell (and is currently swallowed by the comment), and the
# "import altair as alt" appended after weekly_counts is another cell's
# first line. De-fuse these when restoring the notebook.
#good! It seems we have month and week level data, tho not hours, but this is enough to investigatemonthly_counts = crime_gdf.groupby(["year_month","crime_category"]).size().reset_index(name="count")
weekly_counts = crime_gdf.groupby(["day_of_week","crime_category"]).size().reset_index(name="count")import altair as alt
def create_time_charts(selected_category):
    """Build the (monthly line chart, weekly bar chart) pair for a category.

    Parameters
    ----------
    selected_category : str
        A value of crime_gdf["crime_category"], or "All" to aggregate
        every category together.

    Returns
    -------
    (alt.Chart, alt.Chart)
        The monthly-trend line chart and the day-of-week bar chart.

    Relies on the module-level `monthly_counts` / `weekly_counts`
    aggregation tables.
    """
    if selected_category == "All":
        # groupby().sum() already returns fresh frames, safe to mutate.
        df_month = monthly_counts.groupby("year_month")["count"].sum().reset_index()
        df_week = weekly_counts.groupby("day_of_week")["count"].sum().reset_index()
    else:
        df_month = monthly_counts[monthly_counts["crime_category"]==selected_category]
        # FIX: copy the filtered slice before the "day_name" column is
        # added below — assigning into a boolean-indexed view raises
        # SettingWithCopyWarning and may silently fail to set the column.
        df_week = weekly_counts[weekly_counts["crime_category"]==selected_category].copy()
    month_chart = (
        alt.Chart(df_month)
        .mark_line(point=True)
        .encode(
            x=alt.X("year_month:N", sort=None, title="Year-Month"),
            y=alt.Y("count:Q", title="Crime Count"),
            tooltip=["year_month","count"]
        )
        .properties(
            width=600,
            height=300,
            title=f"Monthly Trend - {selected_category}"
        )
    )
    # Map pandas weekday numbers (Mon=0 .. Sun=6) to display labels.
    day_map = {0:"Mon",1:"Tue",2:"Wed",3:"Thu",4:"Fri",5:"Sat",6:"Sun"}
    df_week["day_name"] = df_week["day_of_week"].map(day_map)
    week_chart = (
        alt.Chart(df_week)
        .mark_bar()
        .encode(
            x=alt.X("day_name:N", sort=["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]),
            y=alt.Y("count:Q", title="Crime Count"),
            tooltip=["day_name","count"]
        )
        .properties(
            width=300,
            height=300,
            title=f"Weekly Distribution - {selected_category}"
        )
    )
    return month_chart, week_chart

import panel as pn
# Interactive time dashboard: a category selector that drives the
# monthly & weekly charts produced by create_time_charts().
crime_types_all = ["All"] + sorted(crime_gdf["crime_category"].unique().tolist())
category_selector = pn.widgets.Select(
name="Crime Category",
options=crime_types_all,
value="All"
)
# Re-render both chart panes whenever the selector value changes.
@pn.depends(category_selector.param.value)
def update_charts(selected_category):
mchart, wchart = create_time_charts(selected_category)
return pn.Row(
pn.pane.Vega(mchart, width=650, height=350),
pn.pane.Vega(wchart, width=350, height=350)
)
# Assemble title, instructions, widget, and the reactive chart row.
time_dashboard = pn.Column(
"# Crime Monthly & Weekly Distribution",
"Select a crime category to see monthly trend & weekly distribution",
category_selector,
update_charts
)
time_dashboard.servable()import holoviews as hv
import hvplot.pandas
# NOTE(review): assigning .crs directly only relabels the CRS metadata; it
# does not re-project coordinates. The geometry printed below is already in
# Web Mercator meters, so the label appears consistent here — but prefer
# set_crs() (or to_crs() if re-projection is actually intended). Confirm.
hv.extension('bokeh')crime_gdf.crs = "EPSG:3857"
# Drop rows with no geometry, then extract numeric x/y columns for
# hvplot/datashader, and remove any inf/NaN coordinates.
crime_gdf = crime_gdf[crime_gdf.geometry.notnull()]
crime_gdf["x"] = crime_gdf.geometry.x.astype(float)
crime_gdf["y"] = crime_gdf.geometry.y.astype(float)
crime_gdf = crime_gdf.replace([np.inf, -np.inf], np.nan)
crime_gdf = crime_gdf.dropna(subset=["x","y"])
crime_gdf.geometry.head()0    POINT (-8362262.529 4865786.288)
1 POINT (-8370515.230 4854664.866)
2 POINT (-8350639.372 4879687.881)
3 POINT (-8358307.404 4868543.262)
4 POINT (-8366984.170 4872476.776)
Name: geometry, dtype: geometry
crime_gdf.head(20)| cartodb_id | the_geom | the_geom_webmercator | objectid | dc_dist | psa | dispatch_date_time | dispatch_date | dispatch_time | hour | ... | text_general_code | point_x | point_y | geometry | year_month | crime_category | day_of_week | hour_of_day | x | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 0101000020E6100000A51C8299A5C752C006342AD3DCFF... | 0101000020110F0000F80DE2A145E65FC1E5EC7592BE8F... | 114 | 25 | 3 | 2023-03-11T17:12:00Z | 2023-03-11 | 12:12:00 | 12.0 | ... | Robbery No Firearm | -75.119482 | 39.998927 | POINT (-8362262.529 4865786.288) | 2023-03 | Violent Crime | 5 | 0 | -8.362263e+06 | 4.865786e+06 |
| 1 | 4 | 0101000020E6100000F9245E3B64CC52C0B7195D940FF6... | 0101000020110F00000426B7CE54EE5FC1C5E06D37E284... | 116 | 01 | 1 | 2023-03-11T18:31:00Z | 2023-03-11 | 13:31:00 | 13.0 | ... | Theft from Vehicle | -75.193618 | 39.922350 | POINT (-8370515.230 4854664.866) | 2023-03 | Property Crime | 5 | 0 | -8.370515e+06 | 4.854665e+06 |
| 2 | 7 | 0101000020E6100000118A52E7F6C052C0CFF41263190C... | 0101000020110F00006728CED7EBDA5FC169DB64F8519D... | 119 | 08 | 2 | 2023-03-11T22:13:00Z | 2023-03-11 | 17:13:00 | 17.0 | ... | Thefts | -75.015070 | 40.094525 | POINT (-8350639.372 4879687.881) | 2023-03 | Property Crime | 5 | 0 | -8.350639e+06 | 4.879688e+06 |
| 3 | 123 | 0101000020E6100000E1F9FB7B5FC552C0159C0B6D4A02... | 0101000020110F00009D28D4D968E25FC13CD5C3D06F92... | 96 | 15 | 1 | 2023-03-11T12:42:00Z | 2023-03-11 | 07:42:00 | 7.0 | ... | Thefts | -75.083953 | 40.017896 | POINT (-8358307.404 4868543.262) | 2023-03 | Property Crime | 5 | 0 | -8.358307e+06 | 4.868543e+06 |
| 4 | 126 | 0101000020E6100000D1CCD5875CCA52C014B723FFC005... | 0101000020110F00002F28E30AE2EA5FC10090A3314796... | 99 | 14 | 1 | 2023-03-12T00:54:00Z | 2023-03-11 | 19:54:00 | 19.0 | ... | Robbery No Firearm | -75.161898 | 40.044952 | POINT (-8366984.170 4872476.776) | 2023-03 | Violent Crime | 5 | 0 | -8.366984e+06 | 4.872477e+06 |
| 5 | 128 | 0101000020E61000002B7C851E94C252C0FB6A79ABCF03... | 0101000020110F000050B036BBA9DD5FC1A20DA3831F94... | 101 | 15 | 3 | 2023-03-11T18:53:00Z | 2023-03-11 | 13:53:00 | 13.0 | ... | Thefts | -75.040290 | 40.029775 | POINT (-8353446.925 4870270.057) | 2023-03 | Property Crime | 5 | 0 | -8.353447e+06 | 4.870270e+06 |
| 6 | 129 | 0101000020E6100000C71530E485C852C03A8354C44800... | 0101000020110F000023CB499DC2E75FC1CD36223F3690... | 102 | 25 | 3 | 2023-03-11T07:03:00Z | 2023-03-11 | 02:03:00 | 2.0 | ... | Aggravated Assault Firearm | -75.133172 | 40.002221 | POINT (-8363786.458 4866264.986) | 2023-03 | Violent Crime | 5 | 0 | -8.363786e+06 | 4.866265e+06 |
| 7 | 138 | 0101000020E6100000E26C2165D7BE52C0EE405BD61608... | 0101000020110F000097CD95A350D75FC182830489DE98... | 110 | 08 | 2 | 2023-03-11T12:22:00Z | 2023-03-11 | 07:22:00 | 7.0 | ... | Theft from Vehicle | -74.981897 | 40.063197 | POINT (-8346946.556 4875130.141) | 2023-03 | Property Crime | 5 | 0 | -8.346947e+06 | 4.875130e+06 |
| 8 | 146 | 0101000020E6100000ABAA7E4249CF52C0CAFACDC474F6... | 0101000020110F000001F690843FF35FC18CE049475285... | 284 | 12 | 2 | 2023-02-26T14:40:00Z | 2023-02-26 | 09:40:00 | 9.0 | ... | Thefts | -75.238846 | 39.925439 | POINT (-8375550.071 4855113.114) | 2023-02 | Property Crime | 6 | 0 | -8.375550e+06 | 4.855113e+06 |
| 9 | 149 | 0101000020E6100000DF2A99ADC6CE52C00BF379200DFD... | 0101000020110F0000EDBF38B661F25FC1DC5ECECBA08C... | 287 | 19 | 2 | 2023-02-26T22:52:00Z | 2023-02-26 | 17:52:00 | 17.0 | ... | Thefts | -75.230876 | 39.976963 | POINT (-8374662.847 4862595.184) | 2023-02 | Property Crime | 6 | 0 | -8.374663e+06 | 4.862595e+06 |
| 10 | 152 | 0101000020E6100000093D93E496CF52C0EDE0D4C575FC... | 0101000020110F00007C0FB162C3F35FC1759C000EF98B... | 290 | 19 | 2 | 2023-02-26T05:31:00Z | 2023-02-26 | 00:31:00 | 0.0 | ... | Theft from Vehicle | -75.243585 | 39.972344 | POINT (-8376077.542 4861924.219) | 2023-02 | Property Crime | 6 | 0 | -8.376078e+06 | 4.861924e+06 |
| 11 | 153 | 0101000020E6100000D50648B048CB52C0E395DA41DBFC... | 0101000020110F000017AE3E2E73EC5FC137E49986698C... | 291 | 22 | 4 | 2023-02-26T18:35:00Z | 2023-02-26 | 13:35:00 | 13.0 | ... | Theft from Vehicle | -75.176312 | 39.975441 | POINT (-8368588.723 4862374.103) | 2023-02 | Property Crime | 6 | 0 | -8.368589e+06 | 4.862374e+06 |
| 12 | 159 | 0101000020E6100000E753850E13CC52C01D586D8298FD... | 0101000020110F000062125BECCAED5FC1E2F5A9473B8D... | 297 | 22 | 4 | 2023-02-26T18:27:00Z | 2023-02-26 | 13:27:00 | 13.0 | ... | Theft from Vehicle | -75.188663 | 39.981217 | POINT (-8369963.693 4863213.120) | 2023-02 | Property Crime | 6 | 0 | -8.369964e+06 | 4.863213e+06 |
| 13 | 295 | 0101000020E6100000DACF9CD4DBCB52C0E03C286A61F7... | 0101000020110F00007325B21D6DED5FC17D9F205F5886... | 213 | 17 | 2 | 2023-02-26T05:46:00Z | 2023-02-26 | 00:46:00 | 0.0 | ... | Aggravated Assault Firearm | -75.185292 | 39.932660 | POINT (-8369588.464 4856161.486) | 2023-02 | Violent Crime | 6 | 0 | -8.369588e+06 | 4.856161e+06 |
| 14 | 298 | 0101000020E6100000F9EBA1BFC8C952C05482518B39F5... | 0101000020110F0000A5891505E7E95FC1D8A22933F583... | 216 | 03 | 2 | 2023-02-26T09:18:00Z | 2023-02-26 | 04:18:00 | 4.0 | ... | Aggravated Assault No Firearm | -75.152878 | 39.915819 | POINT (-8365980.079 4853716.799) | 2023-02 | Violent Crime | 6 | 0 | -8.365980e+06 | 4.853717e+06 |
| 15 | 300 | 0101000020E6100000DE731E1DF0CA52C0A35E2A15D8F4... | 0101000020110F0000352233BADCEB5FC1FAC9FC478983... | 218 | 03 | 3 | 2023-02-26T21:31:00Z | 2023-02-26 | 16:31:00 | 16.0 | ... | Theft from Vehicle | -75.170905 | 39.912844 | POINT (-8367986.909 4853285.125) | 2023-02 | Property Crime | 6 | 0 | -8.367987e+06 | 4.853285e+06 |
| 16 | 301 | 0101000020E61000003AA2CE1E60CB52C09D4C9A0E3604... | 0101000020110F00009E3458FB9AEC5FC18ED7CF149194... | 219 | 14 | 3 | 2023-02-26T21:23:00Z | 2023-02-26 | 16:23:00 | 16.0 | ... | Thefts | -75.177742 | 40.032900 | POINT (-8368747.927 4870724.325) | 2023-02 | Property Crime | 6 | 0 | -8.368748e+06 | 4.870724e+06 |
| 17 | 303 | 0101000020E61000009D593B1F58C952C0D52F09FED403... | 0101000020110F0000A1A859B627E95FC14653F26A2594... | 221 | 35 | 2 | 2023-02-27T00:41:00Z | 2023-02-26 | 19:41:00 | 19.0 | ... | Thefts | -75.146004 | 40.029938 | POINT (-8365214.849 4870293.671) | 2023-02 | Property Crime | 6 | 0 | -8.365215e+06 | 4.870294e+06 |
| 18 | 322 | 0101000020E6100000A4A812E9E7CB52C041D910F397F5... | 0101000020110F0000B60B8DA281ED5FC1A7D720BD5D84... | 240 | 01 | 2 | 2023-02-27T02:22:00Z | 2023-02-26 | 21:22:00 | 21.0 | ... | Thefts | -75.186030 | 39.918700 | POINT (-8369670.540 4854134.955) | 2023-02 | Property Crime | 6 | 0 | -8.369671e+06 | 4.854135e+06 |
| 19 | 327 | 0101000020E6100000812040860EC752C0172A628519FF... | 0101000020110F00008ADE100445E55FC132D1110EE68E... | 245 | 24 | 1 | 2023-02-26T18:38:00Z | 2023-02-26 | 13:38:00 | 13.0 | ... | Thefts | -75.110261 | 39.992966 | POINT (-8361236.064 4864920.220) | 2023-02 | Property Crime | 6 | 0 | -8.361236e+06 | 4.864920e+06 |
20 rows × 23 columns
import holoviews as hv
import hvplot.pandas
hv.extension('bokeh')
import datashader as ds
import numpy as np

# Sorted option lists backing the two dashboard widgets.
months_sorted = sorted(crime_gdf["year_month"].unique().tolist())
cats_sorted = sorted(crime_gdf["crime_category"].unique().tolist())

# Slim copy holding only the columns the datashaded map needs.
df_hv = crime_gdf[["year_month","crime_category","x","y"]].copy()

# Per-month, per-category totals feeding the bar chart.
monthly_counts = (
    crime_gdf
    .groupby(["year_month","crime_category"])
    .size()
    .reset_index(name="count")
)

# One fixed color per category so the map legend stays stable across months.
color_key_dict = {
    "Violent Crime": "#e31a1c",
    "Property Crime": "#1f78b4",
    "Drug/Alcohol Crime": "#33a02c",
    "Sexual Offenses": "#ff7f00",
    "Miscellaneous": "#6a3d9a",
    "Other": "#b15928",
}

# Widgets: pick a month and a crime category ("All" aggregates everything).
month_selector = pn.widgets.Select(
    name="Month",
    options=months_sorted,
    value=months_sorted[0],
)
crime_selector = pn.widgets.Select(
    name="Crime Category",
    options=["All"] + cats_sorted,
    value="Violent Crime",
)
def create_bar_chart(selected_month, selected_crime):
    """Altair bar chart of monthly totals, highlighting the chosen month.

    The bar matching `selected_month` is drawn orange, all others gray.
    Reads the module-level `monthly_counts` table.
    """
    # Rows to plot: city-wide totals, or a single category's series.
    if selected_crime == "All":
        chart_df = monthly_counts.groupby("year_month", as_index=False)["count"].sum()
    else:
        chart_df = monthly_counts[monthly_counts["crime_category"]==selected_crime]
    # Conditional color: orange for the selected month, gray otherwise.
    highlight = alt.condition(
        alt.datum.year_month == selected_month,
        alt.value("orange"),
        alt.value("gray"),
    )
    encoded = alt.Chart(chart_df).mark_bar().encode(
        x=alt.X("year_month:N", sort=None, title="Month"),
        y=alt.Y("count:Q", title="Crime Count"),
        color=highlight,
        tooltip=["year_month","count"],
    )
    return encoded.properties(
        width=600,
        height=250,
        title=f"Monthly Distribution for '{selected_crime}'",
    )
@pn.depends(month_selector, crime_selector)
def update_map(selected_month, selected_crime):
    """Datashaded point map of crimes for one month.

    With "All", every category is rendered at once and colored via
    color_key_dict; a single category is shaded red. Returns an hvplot
    map, or an hv.Text placeholder when the selection has no rows.
    Reads the module-level `df_hv` table.
    """
    month_mask = df_hv["year_month"] == selected_month
    if selected_crime == "All":
        sub_df = df_hv[month_mask]
        aggregator = ds.count_cat("crime_category")
        color_args = dict(color_key=color_key_dict)
    else:
        sub_df = df_hv[month_mask & (df_hv["crime_category"] == selected_crime)]
        aggregator = ds.count()
        color_args = dict(cmap=["red"])
    # Nothing matched: show a text placeholder instead of an empty map.
    if sub_df.empty:
        placeholder = hv.Text(0, 0, f"No data for {selected_month}, {selected_crime}")
        return placeholder.opts(width=300, height=200)
    # Points are already in EPSG:3857, so re-projection is skipped
    # (project=False) and a light-gray Carto basemap is used.
    return sub_df.hvplot.points(
        x="x",
        y="y",
        crs="EPSG:3857",
        geo=True,
        tiles="CartoLight",
        datashade=True,
        dynspread=True,
        aggregator=aggregator,
        width=700,
        height=450,
        project=False,
        **color_args,
    )
@pn.depends(month_selector, crime_selector)
def combined_dashboard(selected_month, selected_crime):
    """Stack the crime map above its matching monthly bar chart."""
    map_view = update_map(selected_month, selected_crime)
    chart = create_bar_chart(selected_month, selected_crime)
    return pn.Column(map_view, pn.pane.Vega(chart, width=650, height=300))
# Assemble the final dashboard: markdown title, the two selector widgets,
# and the reactive map + bar-chart panel bound to them.
app = pn.Column(
"# Crime Distribution (Light Gray Map) with Legend & Bar Chart",
pn.Row(month_selector, crime_selector),
combined_dashboard
)
# Mark for `panel serve`; also renders inline when run in a notebook.
app.servable()